<a href="https://colab.research.google.com/github/parijit/colab_learnings/blob/main/machine_learning_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import library

In [1]:
import pandas as pd
import numpy as np
import tensorflow
import xgboost
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer #for imputation
from sklearn.compose import ColumnTransformer #for encoding categoriccal variables
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder #for encoding dependent variable labels 
from sklearn.model_selection import train_test_split #for splitting randomly
from sklearn.preprocessing import StandardScaler

# Import Dataset

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df =pd.read_csv('/content/drive/MyDrive/Machine Learning A-Z (Codes and Datasets)/Part 1 - Data Preprocessing/Section 2 -------------------- Part 1 - Data Preprocessing --------------------/Python/Data.csv')

In [4]:
x=df.iloc[:,:-1].values
y=df.iloc[:,-1].values

In [5]:
print(x)
print(y)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


# Missing Data treatment


In [6]:
imputer = SimpleImputer(missing_values=np.nan, strategy = 'mean' )
imputer.fit(x[:,1:3])
x[:,1:3] =imputer.transform(x[:,1:3])

In [7]:
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


# Encoding Categorical Data

Encoding independent variable

In [8]:
ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[0])], remainder='passthrough') 
#pass through keeps other columns intact and doesnt drops them if we dont encode

In [9]:
x = np.array(ct.fit_transform(x)) # we have force fit into array since x is already an array 

In [10]:
print(x)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


Encoding dependent variable

In [11]:
le =LabelEncoder()
y = le.fit_transform(y)

In [12]:
print(y)

[0 1 0 0 1 1 0 1 0 1]


Splitting Dataset

In [13]:
xtrain,xtest,ytrain,ysplit =train_test_split(x,y,test_size=0.2,random_state=1) # fix the seed so that we get values from same seed in 
#the variables so that we get the corresponding values; takes all variable randomly 

In [14]:
print(xtrain)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [15]:
print(xtest)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [16]:
print(ytrain)

[0 1 0 0 1 1 0 1]


In [17]:
print(ysplit)

[0 1]


Feature Scaling

In [18]:
sc= StandardScaler()
xtrain[:,1:3]=sc.fit_transform(xtrain[:,1:3])
xtest[:,1:3]=sc.transform(xtest[:,1:3])

In [19]:
print(xtrain)

[[0.0 -0.5773502691896258 1.2909944487358056 38.77777777777778 52000.0]
 [0.0 1.7320508075688774 -0.7745966692414834 40.0 63777.77777777778]
 [1.0 -0.5773502691896258 -0.7745966692414834 44.0 72000.0]
 [0.0 -0.5773502691896258 1.2909944487358056 38.0 61000.0]
 [0.0 -0.5773502691896258 1.2909944487358056 27.0 48000.0]
 [1.0 -0.5773502691896258 -0.7745966692414834 48.0 79000.0]
 [0.0 1.7320508075688774 -0.7745966692414834 50.0 83000.0]
 [1.0 -0.5773502691896258 -0.7745966692414834 35.0 58000.0]]


In [20]:
print(xtest)

[[0.0 1.7320508075688774 -0.7745966692414834 30.0 54000.0]
 [1.0 -0.5773502691896258 -0.7745966692414834 37.0 67000.0]]
