# Importing Libraries

In [119]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Load Dataset

In [85]:
# Read the raw dataset; the relative path 'data.csv' is resolved against the current working directory.
df = pd.read_csv('data.csv')

In [86]:
# Show the loaded frame; the output below has 10 rows, with one missing Salary and one missing Age.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# PreProcessing

## Handling missing data

In [87]:
# Numeric columns that contain gaps and must be imputed.
num_cols = ['Age', 'Salary']

# Median imputer (note: the variable is called `imp_mean` but the strategy is 'median').
# Alternatives: strategy='mean', or strategy='constant' together with fill_value=<value>.
# Passing add_indicator=True appends indicator columns marking which entries were filled in.
imp_mean = SimpleImputer(strategy='median')

# Apply the imputer to the numeric columns only. Columns that are not listed
# are left out of the result (ColumnTransformer's default is remainder='drop'),
# which is why the output below has just two columns.
preprocessor = ColumnTransformer(transformers=[('num', imp_mean, num_cols)])

# Learn the per-column medians and fill the missing entries in one step.
transformed_data = preprocessor.fit_transform(df)

transformed_data

array([[4.4e+01, 7.2e+04],
       [2.7e+01, 4.8e+04],
       [3.0e+01, 5.4e+04],
       [3.8e+01, 6.1e+04],
       [4.0e+01, 6.1e+04],
       [3.5e+01, 5.8e+04],
       [3.8e+01, 5.2e+04],
       [4.8e+01, 7.9e+04],
       [5.0e+01, 8.3e+04],
       [3.7e+01, 6.7e+04]])

In [88]:
# Rebuild a DataFrame from the imputed array, reusing df's own index.
# The column assignment below aligns rows by index label, so a frame with a
# fresh 0..n-1 index would write NaN into df whenever df carries a
# non-default index (for example after filtering rows or calling set_index).
transformed_df = pd.DataFrame(transformed_data, columns=num_cols, index=df.index)

# Overwrite the numeric columns of df with their imputed values.
# NOTE: this mutates df in place, so the frame displayed right after loading
# no longer reflects the current contents of df.
df[num_cols] = transformed_df

df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,61000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Creating unique numerical data for labels

### Label Encoding

In [89]:
# Distinct country names, in order of first appearance.
df['Country'].unique()



array(['France', 'Spain', 'Germany'], dtype=object)

In [90]:
# Number of rows per country.
df['Country'].value_counts()

Country
France     4
Spain      3
Germany    3
Name: count, dtype: int64

In [91]:
# Distinct values of the Purchased column, in order of first appearance.
df['Purchased'].unique()

array(['No', 'Yes'], dtype=object)

In [92]:
# Number of rows per Purchased value (the output below shows a 5/5 split).
df['Purchased'].value_counts()

Purchased
No     5
Yes    5
Name: count, dtype: int64

In [94]:
# Encoder that maps each distinct label to an integer code between 0 and n_classes-1.
label_encoder = LabelEncoder()
label_encoder

In [98]:
# Work on a copy so the text columns in df stay available for one-hot encoding later.
df1 = df.copy()

# Use a dedicated encoder for 'Country'. Re-fitting one shared LabelEncoder on
# 'Purchased' afterwards would discard the classes it learned for 'Country',
# so the country codes could no longer be mapped back with inverse_transform.
country_encoder = LabelEncoder()
df1['Country'] = country_encoder.fit_transform(df['Country'])

# label_encoder ends up fitted on 'Purchased', exactly as it did before.
df1['Purchased'] = label_encoder.fit_transform(df['Purchased'])

# Both text columns now hold integer codes, numbered in sorted class order
# (France=0, Germany=1, Spain=2; No=0, Yes=1).
# NOTE: integer codes imply an ordering that country names do not have;
# the one-hot encoding section avoids that.
df1

Unnamed: 0,Country,Age,Salary,Purchased
0,0,44.0,72000.0,0
1,2,27.0,48000.0,1
2,1,30.0,54000.0,0
3,2,38.0,61000.0,0
4,1,40.0,61000.0,1
5,0,35.0,58000.0,1
6,2,38.0,52000.0,0
7,0,48.0,79000.0,1
8,1,50.0,83000.0,0
9,0,37.0,67000.0,1


In [99]:
# Integer codes now stored in Country, in order of first appearance.
df1['Country'].unique()

array([0, 2, 1])

In [100]:
# Integer codes now stored in Purchased, in order of first appearance.
df1['Purchased'].unique()

array([0, 1])

### One Hot Encoding (creates a new column for each individual element in the initial data)

In [101]:
# Start again from the imputed frame (text columns intact) for the one-hot encoding example.
df2 = df.copy()
df2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,61000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [102]:
# One-hot encoder with default settings: it produces one 0/1 column per category.
ohe = OneHotEncoder()
ohe

In [103]:
# Fit the encoder on both text columns and get one 0/1 column per category.
# fit_transform returns a sparse matrix here, so .toarray() converts it to a dense ndarray.
# NOTE(review): 'Purchased' is used as the target in the Split Data section; confirm it is
# meant to be one-hot encoded alongside 'Country'.
feature_arr = ohe.fit_transform(df2[['Country', 'Purchased']]).toarray()
feature_arr

array([[1., 0., 0., 1., 0.],
       [0., 0., 1., 0., 1.],
       [0., 1., 0., 1., 0.],
       [0., 0., 1., 1., 0.],
       [0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 1.],
       [0., 0., 1., 1., 0.],
       [1., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0.],
       [1., 0., 0., 0., 1.]])

In [104]:
# One array of category names per encoded column, in the order the columns were passed to the encoder.
feature_labels = ohe.categories_

# Join the per-column arrays into a single 1-D array of names that lines up
# with the columns of the one-hot matrix.
# (The bare `feature_labels` expression that used to sit between these two
# statements was removed: only the last expression of a cell is displayed,
# so it had no effect.)
flattened_labels = np.concatenate(feature_labels)

In [105]:
# np.concatenate already returned a 1-D ndarray, so wrapping it in np.array(...)
# and calling .ravel() added nothing; a plain copy gives the same independent 1-D array.
feature_labelss = flattened_labels.copy()
print(feature_labelss)

['France' 'Germany' 'Spain' 'No' 'Yes']


In [106]:
# Wrap the one-hot matrix in a DataFrame, one column per category name.
# The rows of feature_arr come from df2 in order, so reuse df2's index:
# a later column-wise concat aligns on index labels and would pad with NaN
# rows if df2 ever carried a non-default index.
feature_df = pd.DataFrame(feature_arr, columns=feature_labelss, index=df2.index)

In [107]:
# Assemble the encoded data set: the numeric columns first, followed by the
# one-hot columns, placed side by side (rows are matched on index label).
df3 = pd.concat(
    [df2.loc[:, ['Age', 'Salary']], feature_df],
    axis='columns',
)
df3

Unnamed: 0,Age,Salary,France,Germany,Spain,No,Yes
0,44.0,72000.0,1.0,0.0,0.0,1.0,0.0
1,27.0,48000.0,0.0,0.0,1.0,0.0,1.0
2,30.0,54000.0,0.0,1.0,0.0,1.0,0.0
3,38.0,61000.0,0.0,0.0,1.0,1.0,0.0
4,40.0,61000.0,0.0,1.0,0.0,0.0,1.0
5,35.0,58000.0,1.0,0.0,0.0,0.0,1.0
6,38.0,52000.0,0.0,0.0,1.0,1.0,0.0
7,48.0,79000.0,1.0,0.0,0.0,0.0,1.0
8,50.0,83000.0,0.0,1.0,0.0,1.0,0.0
9,37.0,67000.0,1.0,0.0,0.0,0.0,1.0


## Split Data

In [111]:
# Column names of the label-encoded frame, used to pick features and target below.
df1.columns

Index(['Country', 'Age', 'Salary', 'Purchased'], dtype='object')

In [115]:
# Features (X) and target (y) are both taken from the label-encoded frame.
X = df1.loc[:, ['Country', 'Age', 'Salary']]
y = df1.loc[:, 'Purchased']

# Hold out 25% of the rows for testing; random_state pins the shuffle so the
# same rows land in each part on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)


## Feature Scaling (Standardization/Normalization)

### Train

In [120]:
# Learn the per-column mean and standard deviation from the training rows only.
scaler = StandardScaler().fit(X_train)

In [122]:
# Print the estimator's text representation.
print(scaler)

StandardScaler()


In [124]:
scaler.mean_
# mean of every feature column, computed on the training rows

array([8.57142857e-01, 4.02857143e+01, 6.60000000e+04])

In [126]:
# scale_ holds the per-column standard deviation (the square root of the variance, not the variance itself)
scaler.scale_

array([8.32993128e-01, 7.34291273e+00, 1.15387546e+04])

In [129]:
# Standardize the training features: (x - mean_) / scale_ for every column
X_train_scaled = scaler.transform(X_train)
X_train_scaled

array([[ 0.17149859, -0.03891021, -0.43332233],
       [-1.02899151,  0.50583275,  0.51998679],
       [ 1.37198868, -0.31128169, -0.43332233],
       [ 1.37198868, -1.80932482, -1.55996038],
       [-1.02899151,  1.0505757 ,  1.12663805],
       [ 0.17149859,  1.32294718,  1.47329592],
       [-1.02899151, -0.71983891, -0.69331573]])

In [131]:
X_train_scaled.mean(axis=0)
# each column's mean should be 0 after standardization (tiny float error such as 1e-16 is expected)

array([0.0000000e+00, 1.4274296e-16, 0.0000000e+00])

In [133]:
X_train_scaled.std(axis=0)
# each column's standard deviation should be 1 after standardization

array([1., 1., 1.])

### Test 

In [134]:
scaler1 = StandardScaler().fit(X_test)

In [135]:
scaler1.mean_
#mean for every column under the dataset

array([1.00000000e+00, 3.50000000e+01, 5.76666667e+04])

In [136]:
# get the varience
scaler1.scale_

array([8.16496581e-01, 3.55902608e+00, 6.64997911e+03])

In [138]:
#transform
X_test_scaled = scaler1.transform(X_test)
X_test_scaled

array([[ 0.        , -1.40487872, -0.55138018],
       [-1.22474487,  0.56195149,  1.40351318],
       [ 1.22474487,  0.84292723, -0.852133  ]])

In [139]:
X_test_scaled.mean(axis=0)
#its mean should be 0 if its standardized

array([0.00000000e+00, 0.00000000e+00, 4.07081776e-16])

In [140]:
X_test_scaled.std(axis=0)
#std should be 1

array([1., 1., 1.])

# Simple Linear Regression