## Task - 02
 
### DATA PREPROCESSING
#### - Handle missing values and outliers appropriately.
#### - Normalize or scale features as needed.
#### - Split the data into training and testing sets

In [18]:
import pandas as pd

In [2]:
df = pd.read_csv('Dataset/phone_sales.csv')
df.head()

Unnamed: 0,Store,Month,Model,Units_Sold,Revenue
0,Store_A,January,iPhone,10.0,10000.0
1,Store_B,February,Samsung,,8500.0
2,Store_C,March,OnePlus,12.0,
3,Store_D,April,iPhone,,12000.0
4,Store_E,May,Samsung,15.0,13500.0


In [3]:
df.shape

(15, 5)

In [4]:
df.isnull().sum()

Store         2
Month         4
Model         2
Units_Sold    4
Revenue       2
dtype: int64

In [5]:
df.isnull().sum().sum()

14

### Handling Missing Values

In [6]:
df['Store'].fillna('Unknown', inplace=True)
df

Unnamed: 0,Store,Month,Model,Units_Sold,Revenue
0,Store_A,January,iPhone,10.0,10000.0
1,Store_B,February,Samsung,,8500.0
2,Store_C,March,OnePlus,12.0,
3,Store_D,April,iPhone,,12000.0
4,Store_E,May,Samsung,15.0,13500.0
5,Unknown,,OnePlus,14.0,15000.0
6,Store_G,July,iPhone,20.0,20000.0
7,Store_H,,Samsung,18.0,18000.0
8,Store_I,September,OnePlus,,17500.0
9,Store_J,October,iPhone,22.0,22000.0


In [7]:
df['Month'].fillna(method='pad', inplace=True)
df

  df['Month'].fillna(method='pad', inplace=True)


Unnamed: 0,Store,Month,Model,Units_Sold,Revenue
0,Store_A,January,iPhone,10.0,10000.0
1,Store_B,February,Samsung,,8500.0
2,Store_C,March,OnePlus,12.0,
3,Store_D,April,iPhone,,12000.0
4,Store_E,May,Samsung,15.0,13500.0
5,Unknown,May,OnePlus,14.0,15000.0
6,Store_G,July,iPhone,20.0,20000.0
7,Store_H,July,Samsung,18.0,18000.0
8,Store_I,September,OnePlus,,17500.0
9,Store_J,October,iPhone,22.0,22000.0


In [8]:
df['Model'].fillna(method='bfill', inplace=True)
df

  df['Model'].fillna(method='bfill', inplace=True)


Unnamed: 0,Store,Month,Model,Units_Sold,Revenue
0,Store_A,January,iPhone,10.0,10000.0
1,Store_B,February,Samsung,,8500.0
2,Store_C,March,OnePlus,12.0,
3,Store_D,April,iPhone,,12000.0
4,Store_E,May,Samsung,15.0,13500.0
5,Unknown,May,OnePlus,14.0,15000.0
6,Store_G,July,iPhone,20.0,20000.0
7,Store_H,July,Samsung,18.0,18000.0
8,Store_I,September,OnePlus,,17500.0
9,Store_J,October,iPhone,22.0,22000.0


In [9]:
df['Units_Sold'].fillna(df['Units_Sold'].mean(), inplace=True)
df

Unnamed: 0,Store,Month,Model,Units_Sold,Revenue
0,Store_A,January,iPhone,10.0,10000.0
1,Store_B,February,Samsung,19.909091,8500.0
2,Store_C,March,OnePlus,12.0,
3,Store_D,April,iPhone,19.909091,12000.0
4,Store_E,May,Samsung,15.0,13500.0
5,Unknown,May,OnePlus,14.0,15000.0
6,Store_G,July,iPhone,20.0,20000.0
7,Store_H,July,Samsung,18.0,18000.0
8,Store_I,September,OnePlus,19.909091,17500.0
9,Store_J,October,iPhone,22.0,22000.0


In [10]:
df['Revenue'].fillna(df['Revenue'].mean(), inplace=True)
df

Unnamed: 0,Store,Month,Model,Units_Sold,Revenue
0,Store_A,January,iPhone,10.0,10000.0
1,Store_B,February,Samsung,19.909091,8500.0
2,Store_C,March,OnePlus,12.0,18807.692308
3,Store_D,April,iPhone,19.909091,12000.0
4,Store_E,May,Samsung,15.0,13500.0
5,Unknown,May,OnePlus,14.0,15000.0
6,Store_G,July,iPhone,20.0,20000.0
7,Store_H,July,Samsung,18.0,18000.0
8,Store_I,September,OnePlus,19.909091,17500.0
9,Store_J,October,iPhone,22.0,22000.0


### Identifying and Handling Outliers (Using IQR method)

In [11]:
Q1 = df['Units_Sold'].quantile(0.25)
Q3 = df['Units_Sold'].quantile(0.75)
IQR = Q3 - Q1
df = df[~((df['Units_Sold'] < (Q1 - 1.5 * IQR)) | (df['Units_Sold'] > (Q3 + 1.5 * IQR)))]
df

Unnamed: 0,Store,Month,Model,Units_Sold,Revenue
0,Store_A,January,iPhone,10.0,10000.0
1,Store_B,February,Samsung,19.909091,8500.0
2,Store_C,March,OnePlus,12.0,18807.692308
3,Store_D,April,iPhone,19.909091,12000.0
4,Store_E,May,Samsung,15.0,13500.0
5,Unknown,May,OnePlus,14.0,15000.0
6,Store_G,July,iPhone,20.0,20000.0
7,Store_H,July,Samsung,18.0,18000.0
8,Store_I,September,OnePlus,19.909091,17500.0
9,Store_J,October,iPhone,22.0,22000.0


In [12]:
Q1 = df['Revenue'].quantile(0.25)
Q3 = df['Revenue'].quantile(0.75)
IQR = Q3 - Q1
df = df[~((df['Revenue'] < (Q1 - 1.5 * IQR)) | (df['Revenue'] > (Q3 + 1.5 * IQR)))]
df

Unnamed: 0,Store,Month,Model,Units_Sold,Revenue
0,Store_A,January,iPhone,10.0,10000.0
1,Store_B,February,Samsung,19.909091,8500.0
2,Store_C,March,OnePlus,12.0,18807.692308
3,Store_D,April,iPhone,19.909091,12000.0
4,Store_E,May,Samsung,15.0,13500.0
5,Unknown,May,OnePlus,14.0,15000.0
6,Store_G,July,iPhone,20.0,20000.0
7,Store_H,July,Samsung,18.0,18000.0
8,Store_I,September,OnePlus,19.909091,17500.0
9,Store_J,October,iPhone,22.0,22000.0


### Encoding Categorical Variables

In [13]:
df = pd.get_dummies(df, columns=['Store', 'Month', 'Model'])
df

Unnamed: 0,Units_Sold,Revenue,Store_Store_A,Store_Store_B,Store_Store_C,Store_Store_D,Store_Store_E,Store_Store_G,Store_Store_H,Store_Store_I,...,Month_February,Month_January,Month_July,Month_March,Month_May,Month_October,Month_September,Model_OnePlus,Model_Samsung,Model_iPhone
0,10.0,10000.0,True,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,True
1,19.909091,8500.0,False,True,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,False
2,12.0,18807.692308,False,False,True,False,False,False,False,False,...,False,False,False,True,False,False,False,True,False,False
3,19.909091,12000.0,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,15.0,13500.0,False,False,False,False,True,False,False,False,...,False,False,False,False,True,False,False,False,True,False
5,14.0,15000.0,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,True,False,False
6,20.0,20000.0,False,False,False,False,False,True,False,False,...,False,False,True,False,False,False,False,False,False,True
7,18.0,18000.0,False,False,False,False,False,False,True,False,...,False,False,True,False,False,False,False,False,True,False
8,19.909091,17500.0,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,True,True,False,False
9,22.0,22000.0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,True


### Normalizing/Scaling Features

In [14]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[['Units_Sold', 'Revenue']] = scaler.fit_transform(df[['Units_Sold', 'Revenue']])

### Splitting Data into Training and Testing Sets

In [15]:
X = df.drop(columns=['Revenue'])
y = df['Revenue']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Display the results
print("Training Features:\n", X_train.head())
print("\nTesting Features:\n", X_test.head())
print("\nTraining Labels:\n", y_train.head())
print("\nTesting Labels:\n", y_test.head())

Training Features:
     Units_Sold  Store_Store_A  Store_Store_B  Store_Store_C  Store_Store_D  \
13    1.478985          False          False          False          False   
5    -1.080158          False          False          False          False   
8     0.000000          False          False          False          False   
2    -1.445749          False          False           True          False   
1     0.000000          False           True          False          False   

    Store_Store_E  Store_Store_G  Store_Store_H  Store_Store_I  Store_Store_J  \
13          False          False          False          False          False   
5           False          False          False          False          False   
8           False          False          False           True          False   
2           False          False          False          False          False   
1           False          False          False          False          False   

    ...  Month_February 