In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [6]:
df = pd.read_csv('Dataset/Students.csv')

In [7]:
df.dtypes

StudentID         int64
Name             object
Gender           object
Age               int64
MathScore       float64
EnglishScore    float64
TotalScore        int64
dtype: object

In [4]:
df.head()

Unnamed: 0,StudentID,Name,Gender,Age,MathScore,EnglishScore,TotalScore
0,1,Alice,Female,15,85.0,90.0,175
1,2,Bob,Male,16,,95.0,95
2,3,Charlie,Male,15,80.0,,80
3,4,Diana,Female,16,90.0,85.0,175
4,5,Eve,Female,15,75.0,70.0,145


In [8]:
df['MathScore'].fillna(df['MathScore'].mean(), inplace=True)
df['EnglishScore'].fillna(df['EnglishScore'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['MathScore'].fillna(df['MathScore'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['EnglishScore'].fillna(df['EnglishScore'].mean(), inplace=True)


In [10]:
label_encoder = LabelEncoder()
df['Gender'] = label_encoder.fit_transform(df['Gender'])

In [11]:
df['MathScore'] = df['MathScore'].astype(float)
df['EnglishScore'] = df['EnglishScore'].astype(float)

In [12]:
df['TotalScore'] = df['MathScore'] + df['EnglishScore']

In [13]:
df['TotalScore'] = df['TotalScore'].astype(float)

In [14]:
scaler = StandardScaler()
df[['MathScore', 'EnglishScore', 'TotalScore']] = scaler.fit_transform(df[['MathScore', 'EnglishScore', 'TotalScore']])

In [15]:
df.to_csv('Dataset/Cleaned_student.csv', index=False)

In [16]:
df.head()

Unnamed: 0,StudentID,Name,Gender,Age,MathScore,EnglishScore,TotalScore
0,1,Alice,0,15,0.5,0.597614,0.622841
1,2,Bob,1,16,0.0,1.195229,0.830455
2,3,Charlie,1,15,-0.5,0.0,-0.207614
3,4,Diana,0,16,1.5,0.0,0.622841
4,5,Eve,0,15,-1.5,-1.792843,-1.868523


In [17]:
from scipy import stats
from sklearn.preprocessing import MinMaxScaler

In [18]:
df = pd.read_csv('Dataset/Sales.csv')

In [19]:
df.head()

Unnamed: 0,ProductID,ProductCategory,Price,QuantitySold,Revenue
0,101,Electronics,1500,5,7500
1,102,Electronics,2000,3,6000
2,103,Furniture,500,10,5000
3,104,Clothing,50,100,5000
4,105,Electronics,10000,1,10000


In [20]:
df.dtypes

ProductID           int64
ProductCategory    object
Price               int64
QuantitySold        int64
Revenue             int64
dtype: object

In [23]:
Q1 = df['Price'].quantile(0.25)
Q3 = df['Price'].quantile(0.75)

IQR = Q3 - Q1

In [24]:
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR 

In [25]:
df_cleaned = df[(df['Price'] >= lower_bound) & (df['Price'] <= upper_bound)]

In [26]:
scaler = MinMaxScaler()
df_cleaned[['Price','Revenue']] = scaler.fit_transform(df_cleaned[['Price','Revenue']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned[['Price','Revenue']] = scaler.fit_transform(df_cleaned[['Price','Revenue']])


In [27]:
df[['Price', 'Revenue']] = df_cleaned[['Price','Revenue']].round().astype(int)

In [28]:
df_cleaned.to_csv('Dataset/Cleaned_Sale.csv', index=False)

In [29]:
df_cleaned.head()

Unnamed: 0,ProductID,ProductCategory,Price,QuantitySold,Revenue,Price_Zscore
0,101,Electronics,0.747475,5,1.0,-0.199372
1,102,Electronics,1.0,3,0.571429,-0.04668
2,103,Furniture,0.242424,10,0.285714,-0.504757
3,104,Clothing,0.015152,100,0.285714,-0.642179
5,106,Clothing,0.0,200,0.0,-0.651341
