In [1]:
import pandas as pd

# Location of the raw wholesale-customers CSV, relative to the notebook.
RAW_CSV_PATH = "Wholesale customers data.csv"

# Read the raw file into a DataFrame that the following cells work on.
data = pd.read_csv(RAW_CSV_PATH)



In [2]:
# Display basic info.
# DataFrame.info() writes its summary to stdout on its own and returns None,
# so it is called directly; wrapping it in print() only adds a stray "None".
data.info()

# First five rows, printed as plain text.
print(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Channel           440 non-null    int64
 1   Region            440 non-null    int64
 2   Fresh             440 non-null    int64
 3   Milk              440 non-null    int64
 4   Grocery           440 non-null    int64
 5   Frozen            440 non-null    int64
 6   Detergents_Paper  440 non-null    int64
 7   Delicassen        440 non-null    int64
dtypes: int64(8)
memory usage: 27.6 KB
None
   Channel  Region  Fresh  Milk  Grocery  Frozen  Detergents_Paper  Delicassen
0        2       3  12669  9656     7561     214              2674        1338
1        2       3   7057  9810     9568    1762              3293        1776
2        2       3   6353  8808     7684    2405              3516        7844
3        1       3  13265  1196     4221    6404               507        1788
4      

#### Step 2: Dataset Schema and Storage

In [3]:
# Schema definition: label every column as either categorical or numerical.
# The resulting dictionary documents the dataset layout and can guide
# preprocessing later on.
categorical_columns = ["Channel", "Region"]
numerical_columns = [
    "Fresh",
    "Milk",
    "Grocery",
    "Frozen",
    "Detergents_Paper",
    "Delicassen",
]

# Categorical columns come first so the key order matches the CSV column order.
schema = {
    **dict.fromkeys(categorical_columns, "Categorical"),
    **dict.fromkeys(numerical_columns, "Numerical"),
}
print(schema)

# Save as Parquet: a columnar format that is more efficient to store and
# read back than CSV. The row index is not written.
data.to_parquet("wholesale_customers.parquet", index=False)


{'Channel': 'Categorical', 'Region': 'Categorical', 'Fresh': 'Numerical', 'Milk': 'Numerical', 'Grocery': 'Numerical', 'Frozen': 'Numerical', 'Detergents_Paper': 'Numerical', 'Delicassen': 'Numerical'}


#### Step 3: Profiling the Dataset

In [4]:
# Preview the first 10 rows of the loaded DataFrame (rendered as the cell output).
data.head(10)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185
5,2,3,9413,8259,5126,666,1795,1451
6,2,3,12126,3199,6975,480,3140,545
7,2,3,7579,4956,9426,1669,3321,2566
8,1,3,5963,3648,6192,425,1716,750
9,2,3,6006,11093,18881,1159,7425,2098


In [11]:
from ydata_profiling import ProfileReport
import pandas as pd

# Reload the raw CSV so this cell can be run on its own.
data = pd.read_csv("Wholesale customers data.csv")

# Build the profiling report for the whole DataFrame.
profile = ProfileReport(data, title="Wholesale Customers Data Profile")

# Write the standalone HTML report first. The inline display below has been
# seen to fail with a widget error in some notebook setups; exporting first
# guarantees the file exists even if the display step raises.
profile.to_file("Wholesale customers profile.html")

# Then try to show the report inline in the notebook.
profile.to_notebook_iframe()


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
!pip install ydata_profiling

In [12]:
# Export first: writing the HTML file does not depend on notebook widgets,
# so the report is saved even if the inline display below fails.
profile.to_file("Wholesale customers profile.html")  # Safely generates the HTML file.

# Inline display comes last because it might still throw the widget error.
profile.to_notebook_iframe()


Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

#### Step 4: Train-Test Split

In [14]:
from sklearn.model_selection import train_test_split

# Stage 1: keep 60% of the rows for training and hold back the other 40%.
# The fixed random_state makes the shuffle reproducible.
train, holdout = train_test_split(data, test_size=0.4, random_state=42)

# Stage 2: cut the held-back 40% in half, giving a test set (20% of all rows)
# and a production set (20% of all rows).
test, prod = train_test_split(holdout, test_size=0.5, random_state=42)

# Persist every partition as a Parquet file, without the row index.
partitions = (
    (train, "train.parquet"),
    (test, "test.parquet"),
    (prod, "prod.parquet"),
)
for frame, filename in partitions:
    frame.to_parquet(filename, index=False)


#### Step 5: Data Version Control

In [2]:
import subprocess

# Show the local branches so the current one can be checked.
subprocess.run("git branch".split())

# Publish the local branch to 'origin' and set it as upstream.
# This pushes 'master'; swap in 'main' below if that is your branch name.
push_command = ["git", "push", "-u", "origin", "master"]
subprocess.run(push_command)


CompletedProcess(args=['git', 'push', '-u', 'origin', 'master'], returncode=0)

In [3]:
# Rename the local branch 'master' to 'main'.
rename_command = ["git", "branch", "-m", "master", "main"]
subprocess.run(rename_command)

# Push the renamed branch to GitHub and set it as upstream.
# Exit codes are not checked here; inspect the returned CompletedProcess.
push_command = ["git", "push", "-u", "origin", "main"]
subprocess.run(push_command)


CompletedProcess(args=['git', 'push', '-u', 'origin', 'main'], returncode=1)

In [4]:
# Download the latest refs from the remote ...
subprocess.run("git fetch".split())
# ... then list the remote-tracking branches.
subprocess.run("git branch -r".split())


CompletedProcess(args=['git', 'branch', '-r'], returncode=0)

In [5]:
# Show the state of the working tree and staging area.
status_command = ["git", "status"]
subprocess.run(status_command)


CompletedProcess(args=['git', 'status'], returncode=0)

In [6]:
# Fetch from the configured remote and integrate into the current branch.
pull_command = ["git", "pull"]
subprocess.run(pull_command)


CompletedProcess(args=['git', 'pull'], returncode=0)

In [7]:
import subprocess

# Folder holding the assignment files; every git command below runs inside it.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
project_dir = r"C:\Users\lavan\Documents\CT1\CT1 Assignment"

# Run 'git init' with the project folder as working directory.
init_command = ["git", "init"]
subprocess.run(init_command, cwd=project_dir)


CompletedProcess(args=['git', 'init'], returncode=0)

In [8]:
# Stage everything under the project folder for the next commit.
stage_command = ["git", "add", "."]
subprocess.run(stage_command, cwd=project_dir)


CompletedProcess(args=['git', 'add', '.'], returncode=0)

In [9]:
# Commit the staged files with a descriptive message.
commit_message = "Added dataset splits and profile report"
commit_command = ["git", "commit", "-m", commit_message]
subprocess.run(commit_command, cwd=project_dir)


CompletedProcess(args=['git', 'commit', '-m', 'Added dataset splits and profile report'], returncode=0)

In [10]:
# Remote repository the project is published to.
repo_url = "https://github.com/username/repo.git"  # Replace with your repo URL

# Register that URL as the remote named 'origin'.
add_remote_command = ["git", "remote", "add", "origin", repo_url]
subprocess.run(add_remote_command, cwd=project_dir)


CompletedProcess(args=['git', 'remote', 'add', 'origin', 'https://github.com/username/repo.git'], returncode=0)

In [11]:
# The push below names 'main', so make sure the current branch is actually
# called 'main' first. A repository whose branch is still 'master' fails the
# push with "src refspec main does not match any". -M is a no-op when the
# branch is already named 'main'.
subprocess.run(["git", "branch", "-M", "main"], cwd=project_dir)

# Push 'main' to origin and set it as the upstream branch.
subprocess.run(["git", "push", "-u", "origin", "main"], cwd=project_dir)


CompletedProcess(args=['git', 'push', '-u', 'origin', 'main'], returncode=1)

In [12]:
import subprocess

# Define project directory and repo URL
# NOTE(review): project_dir is an absolute, machine-specific path and repo_url
# is still a placeholder; both must be adapted before the push can succeed.
project_dir = r"C:\Users\lavan\Documents\CT1\CT1 Assignment"
repo_url = "https://github.com/username/repo.git"  # Replace with your repo URL
commit_message = "Added dataset splits and profile report"

# Step 1: Initialize Git repository (safe to repeat on an existing repository)
subprocess.run(["git", "init"], cwd=project_dir)

# Step 2: Add files to staging area
subprocess.run(["git", "add", "."], cwd=project_dir)

# Step 3: Commit changes
subprocess.run(["git", "commit", "-m", commit_message], cwd=project_dir)

# Step 4: Add remote repository.
# 'git remote add' fails when 'origin' already exists (e.g. when this cell is
# re-run), so fall back to updating the URL of the existing remote.
add_remote = subprocess.run(
    ["git", "remote", "add", "origin", repo_url], cwd=project_dir
)
if add_remote.returncode != 0:
    subprocess.run(
        ["git", "remote", "set-url", "origin", repo_url], cwd=project_dir
    )

# Step 5: Make sure the branch being pushed is really called 'main'.
# Without this, a repository whose branch is 'master' fails the push with
# "src refspec main does not match any". -M is a no-op if already 'main'.
subprocess.run(["git", "branch", "-M", "main"], cwd=project_dir)

# Step 6: Push changes to remote repository
subprocess.run(["git", "push", "-u", "origin", "main"], cwd=project_dir)


CompletedProcess(args=['git', 'push', '-u', 'origin', 'main'], returncode=1)

In [1]:
import os

# Define the project directory
# NOTE(review): absolute, machine-specific Windows path; it must be adjusted
# when the notebook is run on another machine.
project_dir = r"C:\Users\lavan\Documents\CT1\CT1 Assignment"




In [2]:
# Report whether the project folder contains a `.git` entry, i.e. whether
# a Git repository has been set up there.
git_metadata_path = os.path.join(project_dir, ".git")
state = "is" if os.path.exists(git_metadata_path) else "is NOT"
print(f"Git {state} initialized in this directory.")

Git is initialized in this directory.


In [13]:
!git add .
!git commit -m "Your commit message"
!git push origin main




On branch master
nothing to commit, working tree clean


error: src refspec main does not match any
error: failed to push some refs to 'https://github.com/username/repo.git'


In [None]:
# List the entries of the current directory, hidden ones included.
# NOTE(review): a bare `ls` relies on IPython's automagic alias; presumably
# used here to confirm that the .git folder exists.
ls -a


In [None]:
git init
!git status



In [None]:
git add .
git commit -m "Added dataset splits and profile report"
git remote add origin https://github.com/nirvaan18/ct1assignment.git
git branch -M main
git push -u origin main