# Data Preprocessing Tools

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [10]:
# Load the training data into a DataFrame; the path is relative to the notebook's working directory.
df = pd.read_csv('train.csv')


In [18]:
# Distinct-value count per column; to_string() renders every row instead of a truncated repr.
print(df.nunique().to_string())

Id               1460
MSSubClass         15
MSZoning            5
LotFrontage       110
LotArea          1073
Street              2
Alley               2
LotShape            4
LandContour         4
Utilities           2
LotConfig           5
LandSlope           3
Neighborhood       25
Condition1          9
Condition2          8
BldgType            5
HouseStyle          8
OverallQual        10
OverallCond         9
YearBuilt         112
YearRemodAdd       61
RoofStyle           6
RoofMatl            8
Exterior1st        15
Exterior2nd        16
MasVnrType          4
MasVnrArea        327
ExterQual           4
ExterCond           5
Foundation          6
BsmtQual            4
BsmtCond            4
BsmtExposure        4
BsmtFinType1        6
BsmtFinSF1        637
BsmtFinType2        6
BsmtFinSF2        144
BsmtUnfSF         780
TotalBsmtSF       721
Heating             6
HeatingQC           5
CentralAir          2
Electrical          5
1stFlrSF          753
2ndFlrSF          417
LowQualFin

In [12]:
# Two-column table (column name, number of distinct values), ordered from fewest to most
# distinct values. Built with one vectorised nunique() call instead of a per-column loop.
unique_counts = (
    df.nunique()
    .rename_axis('Column_Name')
    .reset_index(name='Num_Unique')
    .sort_values(by=['Num_Unique'])
)

In [14]:
# Print the full sorted table of distinct-value counts (no row truncation).
print(unique_counts.to_string())

      Column_Name  Num_Unique
41     CentralAir           2
9       Utilities           2
5          Street           2
6           Alley           2
48   BsmtHalfBath           3
11      LandSlope           3
60   GarageFinish           3
50       HalfBath           3
65     PavedDrive           3
72         PoolQC           3
49       FullBath           4
25     MasVnrType           4
32   BsmtExposure           4
27      ExterQual           4
74    MiscFeature           4
47   BsmtFullBath           4
73          Fence           4
53    KitchenQual           4
31       BsmtCond           4
56     Fireplaces           4
8     LandContour           4
7        LotShape           4
52   KitchenAbvGr           4
30       BsmtQual           4
57    FireplaceQu           5
42     Electrical           5
77         YrSold           5
61     GarageCars           5
63     GarageQual           5
64     GarageCond           5
40      HeatingQC           5
28      ExterCond           5
2        M

In [19]:
# Count the missing entries in the CentralAir column (isna() is the same check as isnull()).
df['CentralAir'].isna().sum()

0

In [25]:
# Two-column table (column name, number of missing values), ordered from fewest to most
# missing. Built with one vectorised isna().sum() instead of a per-column loop.
missing = (
    df.isna()
    .sum()
    .rename_axis('Column_Name')
    .reset_index(name='N_missing')
    .sort_values(by=['N_missing'])
)

In [26]:
# Print the full sorted table of missing-value counts (no row truncation).
print(missing.to_string())

      Column_Name  N_missing
0              Id          0
38    TotalBsmtSF          0
39        Heating          0
79  SaleCondition          0
41     CentralAir          0
43       1stFlrSF          0
44       2ndFlrSF          0
45   LowQualFinSF          0
46      GrLivArea          0
47   BsmtFullBath          0
48   BsmtHalfBath          0
49       FullBath          0
50       HalfBath          0
51   BedroomAbvGr          0
52   KitchenAbvGr          0
53    KitchenQual          0
54   TotRmsAbvGrd          0
55     Functional          0
78       SaleType          0
77         YrSold          0
76         MoSold          0
75        MiscVal          0
71       PoolArea          0
70    ScreenPorch          0
37      BsmtUnfSF          0
69      3SsnPorch          0
67    OpenPorchSF          0
66     WoodDeckSF          0
65     PavedDrive          0
62     GarageArea          0
61     GarageCars          0
56     Fireplaces          0
68  EnclosedPorch          0
36     BsmtFin

In [None]:
# By convention the features (independent variables) are the leading columns and the
# dependent variable we want to predict is the last column.
# NOTE(review): `dataset` is not assigned in any earlier cell shown here (the CSV above is
# loaded into `df`) - confirm which frame this section is meant to use.
# .values converts to NumPy arrays, which the positional slices used further down
# (e.g. X[:, 1:3]) require; a DataFrame does not support that indexing syntax.
X = dataset.iloc[:, :-1].values  # a slice includes its lower bound and excludes its upper bound
y = dataset.iloc[:, -1].values   # index 0 is the first column, index -1 is the last

In [None]:
# Inspect the dependent-variable vector.
print (y)

## Taking care of missing data

In [None]:
 from sklearn.impute import SimpleImputer # impute is a module in the lib and SI is a class from this module
imputer = SimpleImputer(missing_values=np.nan, strategy='mean') # an object of this class: call the class with right arguments, replace by the ave of that feature, so the specific 
imputer.fit(X[:,1:3]) # apply the object on the matrix of features. fit method (function) connects imputer to matrix of feature X. so caclulate the missing val with ave. applies to numbers
X[:,1:3] = imputer.transform(X[:,1:3]) # returns the new X: to replace
# If large dataset, ignor the entire set of where there is missing data
# If a lot of missing data, you want to handle - replace by average of all data, the most frequent, the median, etc
# DS library:

In [None]:
# Inspect the feature matrix after imputation.
print (X)

## Encoding categorical data

In [None]:
# Turn categories into numbers with one-hot encoding: each country becomes a binary vector,
# so no artificial numerical order is created between the categories.
# The numeric order carries no meaning here, so we must keep the ML model from inferring one.
# For the dependent variable it is fine to use 0 and 1 directly, since it is a binary outcome.


### Encoding the Independent Variable

I need to learn this better

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# ct is a transformer object: it one-hot encodes column 0 only ([0] lists the columns to
# encode), and remainder='passthrough' keeps all the other columns unchanged.
ct = ColumnTransformer(
    [('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))

In [None]:
# Inspect the feature matrix after one-hot encoding.
print(X)

### Encoding the Dependent Variable

In [None]:
from sklearn.preprocessing import LabelEncoder
# LabelEncoder maps each distinct label in y to an integer code; fit_transform() learns the
# mapping and applies it in one step, replacing y with the encoded values.
le = LabelEncoder()
y = le.fit_transform(y)

In [None]:
# Inspect the label-encoded dependent variable.
print(y)

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split  # train_test_split is a function
# Hold out 20% of the rows for testing (80% training is the recommended split);
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1 )

In [None]:
# Inspect the training features produced by the split.
print (X_train)

## Feature Scaling

In [None]:
# Feature scaling is not always required, but some models need it.
# Normalisation maps every feature into [0, 1] (per the course notes, suggested when the
# features are roughly normally distributed); standardisation puts most values roughly
# between -3 and 3 and is the safer general-purpose choice.
# Always scale AFTER the train/test split: the test set stands in for unseen future data,
# so the scaler is fitted on the training set only and then merely applied to the test set.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Only the real numerical columns (index 3 onwards) are scaled, not the one-hot encoded
# category columns - assumes the encoding occupies columns 0-2 (TODO confirm).
sc.fit(X_train[:, 3:])                        # learn mean / std from the training data only
X_train[:, 3:] = sc.transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])   # reuse the training statistics on the test set

Questions: understand the difference between a class, an object, a method, and a function.
Question: understand `fit` and `transform`, and how they are combined into `fit_transform` (joined by an underscore).
To do: build up some basic understanding of Python.

In [None]:
# Inspect the training features after scaling.
print (X_train)

In [None]:
# Inspect the test features after applying the training-set scaler.
print (X_test)

# Simple Linear Regression

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
# Load the data and split it positionally: every column except the last is a feature,
# the last column is the target. .values converts both to NumPy arrays.
dataset = pd.read_csv('Salary_Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [None]:
# Inspect the target vector.
print(y)

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Training the Simple Linear Regression model on the Training set

In [None]:
from sklearn.linear_model import LinearRegression
# Create the model object, then fit it on the training split only.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [None]:
# Intercept term of the fitted linear model.
print(regressor.intercept_)

## Predicting the Test set results

In [None]:
# Predictions for the held-out test features.
y_pred = regressor.predict(X_test)

## Visualising the Training set results

In [None]:
# Red dots: observed training points. Blue line: the model's predictions for the same inputs.
plt.scatter(X_train, y_train, color = 'red')
plt.plot(X_train, regressor.predict(X_train), color = 'blue')
plt.title('Salary vs Experience (Training set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()

## Visualising the Test set results

In [None]:
# Red dots: observed test points.
plt.scatter(X_test, y_test, color = 'red')
# The blue line is deliberately traced from the training inputs: the fitted line is the same
# straight line whichever X values are used to draw it.
plt.plot(X_train, regressor.predict(X_train), color = 'blue')
plt.title('Salary vs Experience (Test set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()

# Multiple Linear Regression

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
# Load the data and split it positionally: every column except the last is a feature,
# the last column is the target. .values converts both to NumPy arrays.
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [None]:
# Inspect the raw feature matrix before encoding.
print(X)

## Encoding categorical data

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# [3] refers to the fourth column (index 3), which is the categorical one; it is one-hot
# encoded while remainder='passthrough' keeps the other columns unchanged.
ct = ColumnTransformer(
    [('encoder', OneHotEncoder(), [3])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))

In [None]:
print(X)
# Feature scaling is not needed for multiple linear regression: the fitted coefficients
# adjust to each feature's scale.
# NOTE(review): OneHotEncoder() above is created without `drop`, so every dummy column is
# kept. LinearRegression still fits with the redundant column, so no manual removal is
# needed for prediction - but the class does not actually drop a dummy variable itself.
# NOTE(review): LinearRegression does not perform feature selection; it uses every column
# of X. Selecting features (e.g. backward elimination) would be a separate, explicit step.

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Training the Multiple Linear Regression model on the Training set

In [None]:
from sklearn.linear_model import LinearRegression
# Create the model object, then fit it on the training split only.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

## Predicting the Test set results

In [None]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)  # show two decimals in printed arrays
# Side-by-side comparison: column 0 = predicted value, column 1 = actual test value.
print(np.column_stack((y_pred, y_test)))

# Polynomial Regression

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
# The 1:-1 slice drops the first column as well as the last, so X holds only the middle
# column(s); the last column is the target.
# NOTE(review): presumably column 0 is the position title and column 1 the numeric level
# (the plots label the x-axis 'Position level') - verify against the CSV.
dataset = pd.read_csv('Position_Salaries.csv')
X = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:, -1].values

## Training the Linear Regression model on the whole dataset

In [None]:
# Display the loaded DataFrame to check its columns before modelling.
dataset

In [None]:
from sklearn.linear_model import LinearRegression
# Plain linear model fitted on the whole dataset (no train/test split in this section);
# it serves as the baseline for the polynomial model below.
lin_reg = LinearRegression()
lin_reg.fit(X, y)

## Training the Polynomial Regression model on the whole dataset

In [None]:
from sklearn.preprocessing import PolynomialFeatures
# Polynomial regression = linear regression on polynomial features of X.
# degree = 4 expands each input into its powers up to the fourth.
poly_reg = PolynomialFeatures(degree = 4)
poly_reg.fit(X)                   # fit the feature expander once...
X_poly = poly_reg.transform(X)    # ...then expand the training inputs
# A second linear model is fitted on the expanded features.
lin_reg_2 = LinearRegression()
lin_reg_2.fit(X_poly, y)

## Visualising the Linear Regression results

In [None]:
# Red dots: observed points. Blue line: the plain linear model's predictions.
plt.scatter(X, y, color = 'red')
plt.plot(X, lin_reg.predict(X), color = 'blue')
plt.title('Truth or Bluff (Linear Regression)')
plt.xlabel('Position Level')
plt.ylabel('Salary')
plt.show()

## Visualising the Polynomial Regression results

In [None]:
# Red dots: observed points. Blue curve: the polynomial model's predictions.
plt.scatter(X, y, color = 'red')
# Use transform(), not fit_transform(): poly_reg was already fitted when the model was
# trained, and prediction-time inputs should never re-fit a preprocessing step.
plt.plot(X, lin_reg_2.predict(poly_reg.transform(X)), color = 'blue')
plt.title('Truth or Bluff (Polynomial Regression)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

## Visualising the Polynomial Regression results (for higher resolution and smoother curve)

In [None]:
# Evaluate the model on a fine grid (step 0.1) so the curve is smooth between data points.
# X.min()/X.max() return scalars; the built-in min()/max() on a 2-D array return 1-element
# arrays, which np.arange only accepts with a deprecation warning on recent NumPy.
X_grid = np.arange(X.min(), X.max(), 0.1).reshape(-1, 1)  # column vector, one sample per row
plt.scatter(X, y, color = 'red')
# transform() rather than fit_transform(): the grid must not re-fit the already fitted poly_reg.
plt.plot(X_grid, lin_reg_2.predict(poly_reg.transform(X_grid)), color = 'blue')
plt.title('Truth or Bluff (Polynomial Regression)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

## Predicting a new result with Linear Regression

In [None]:
lin_reg.predict([[6.5]]) # be careful: two sets of square brackets are needed
# [6.5] alone is a 1-D list; predict() expects a 2-D array, and [[6.5]] is one row (sample) with one feature

## Predicting a new result with Polynomial Regression

In [None]:
# Expand 6.5 into the same polynomial features used for training, then predict.
# transform() (not fit_transform()) reuses the already fitted poly_reg instead of re-fitting it.
lin_reg_2.predict(poly_reg.transform([[6.5]]))

# Support Vector Regression (SVR)

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
# The 1:-1 slice drops the first column as well as the last, so X holds only the middle
# column(s); the last column is the target.
dataset = pd.read_csv('Position_Salaries.csv')
X = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:, -1].values

In [None]:
# Inspect the raw feature matrix (2-D).
print(X)

In [None]:
# Inspect the raw target vector (1-D at this point).
print(y)

In [None]:
# The feature *scaling* class (StandardScaler, used in the next section) expects a 2-D
# column, not a 1-D vector, so y is reshaped into a single column.
# -1 lets NumPy infer the number of rows, which is len(y).
y = y.reshape(-1, 1)

In [None]:
# Inspect y after reshaping it into a column.
print(y)

## Feature Scaling

In [None]:
# For SVR, feature scaling is applied to both X and y.
# Without feature scaling the SVR model does not work properly.
# When the data are already 0/1 values, scaling is not necessary.

from sklearn.preprocessing import StandardScaler
# Two separate scalers on purpose: X and y each keep their own mean and standard deviation,
# which are needed later to transform inputs and to inverse-transform predictions.
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit(X).transform(X)
y = sc_y.fit(y).transform(y)

In [None]:
# Inspect the standardised features.
print(X)

In [None]:
# Inspect the standardised target.
print(y)

In [None]:
# Experiment: standardise the (already standardised) target a second time.
# A throwaway scaler is used on purpose. Calling sc_X.fit_transform() here would re-fit
# sc_X on y, overwriting the statistics it learned from X and silently breaking every later
# sc_X.transform() / sc_X.inverse_transform() call in this section.
# StandardScaler is already imported by the feature-scaling cell above.
newy = StandardScaler().fit_transform(y)

In [None]:
# Display the result of the re-scaling experiment.
newy

## Training the SVR model on the whole dataset

In [None]:
from sklearn.svm import SVR
# rbf = radial basis function, the kernel used here; other kernels are available
# (e.g. 'linear', 'poly', 'sigmoid').
regressor = SVR(kernel = 'rbf')
# SVR expects a 1-D target. y was reshaped into a column for StandardScaler, so flatten it
# here to avoid scikit-learn's column-vector warning.
regressor.fit(X, y.ravel())

## Predicting a new result

In [None]:
# 1. Scale 6.5 with sc_X.transform(), i.e. with the same scaler that was fitted on X.
# 2. Predict in scaled space.
# 3. Inverse-transform with sc_y to get the prediction back in the original units of y.
# predict() returns a 1-D array while inverse_transform() expects 2-D input, hence reshape(-1, 1).
sc_y.inverse_transform(regressor.predict(sc_X.transform([[6.5]])).reshape(-1, 1))

## Visualising the SVR results

In [None]:
# Everything is inverse-transformed so the plot is in the original (unscaled) units.
plt.scatter(sc_X.inverse_transform(X), sc_y.inverse_transform(y), color = 'red')
# predict() returns a 1-D array; inverse_transform() expects 2-D input, hence reshape(-1, 1).
plt.plot(sc_X.inverse_transform(X), sc_y.inverse_transform(regressor.predict(X).reshape(-1, 1)), color = 'blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

## Visualising the SVR results (for higher resolution and smoother curve)

In [None]:
# Work in the original units: undo the scaling of X and y once and reuse the results.
X_orig = sc_X.inverse_transform(X)
y_orig = sc_y.inverse_transform(y)
# Fine grid (step 0.1) for a smooth curve. .min()/.max() return scalars; the built-in
# min()/max() on a 2-D array return 1-element arrays, which np.arange only accepts with a
# deprecation warning on recent NumPy.
X_grid = np.arange(X_orig.min(), X_orig.max(), 0.1).reshape(-1, 1)
# Scale the grid, predict, then map the predictions back to original units.
# predict() returns a 1-D array; inverse_transform() expects 2-D input, hence reshape(-1, 1).
y_grid = sc_y.inverse_transform(regressor.predict(sc_X.transform(X_grid)).reshape(-1, 1))
plt.scatter(X_orig, y_orig, color = 'red')
plt.plot(X_grid, y_grid, color = 'blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# Decision Tree Regression

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
# The 1:-1 slice drops the first column as well as the last, so X holds only the middle
# column(s); the last column is the target.
dataset = pd.read_csv('Position_Salaries.csv')
X = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:, -1].values

## Training the Decision Tree Regression model on the whole dataset

In [None]:
# Feature scaling is not needed for decision tree regression or random forest methods.
# The course's reason is that "there are no equations"; more precisely, a tree splits on
# per-feature thresholds, so rescaling a feature only moves the thresholds and leaves the
# resulting partition of the data unchanged.
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state = 0)
regressor.fit(X, y)

## Predicting a new result

In [None]:
# Predict for a single sample; the double brackets make the input 2-D (one row, one feature).
regressor.predict([[6.5]])

## Visualising the Decision Tree Regression results (higher resolution)

In [None]:
# Fine grid (step 0.01) so the piecewise-constant steps of the tree are visible.
# X.min()/X.max() return scalars; the built-in min()/max() on a 2-D array return 1-element
# arrays, which np.arange only accepts with a deprecation warning on recent NumPy.
X_grid = np.arange(X.min(), X.max(), 0.01).reshape(-1, 1)  # column vector, one sample per row
plt.scatter(X, y, color = 'red')
plt.plot(X_grid, regressor.predict(X_grid), color = 'blue')
plt.title('Truth or Bluff (Decision Tree Regression)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# Random Forest Regression

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
# The 1:-1 slice drops the first column as well as the last, so X holds only the middle
# column(s); the last column is the target.
dataset = pd.read_csv('Position_Salaries.csv')
X = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:, -1].values

## Training the Random Forest Regression model on the whole dataset

In [None]:
from sklearn.ensemble import RandomForestRegressor
# n_estimators = 10 builds a forest of 10 trees; random_state fixes the randomness so the
# fitted model is reproducible.
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
regressor.fit(X, y)

## Predicting a new result

In [None]:
# Predict for a single sample; the double brackets make the input 2-D (one row, one feature).
regressor.predict([[6.5]])

## Visualising the Random Forest Regression results (higher resolution)

In [None]:
# Fine grid (step 0.01) so the step-shaped predictions of the forest are visible.
# X.min()/X.max() return scalars; the built-in min()/max() on a 2-D array return 1-element
# arrays, which np.arange only accepts with a deprecation warning on recent NumPy.
X_grid = np.arange(X.min(), X.max(), 0.01).reshape(-1, 1)  # column vector, one sample per row
plt.scatter(X, y, color = 'red')
plt.plot(X_grid, regressor.predict(X_grid), color = 'blue')
plt.title('Truth or Bluff (Random Forest Regression)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()