## 6.4 Machine Learning Part 1

### This script contains the following:

#### 1. Importing libraries and data
#### 2. Data cleaning
#### 3. Data prep for regression analysis
#### 4. Regression analysis
#### Bonus: performance improvement after removing outliers

### 1. Importing libraries and data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import os
import sklearn
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# This option renders matplotlib figures inline, directly below the cell that creates them, without having to display them explicitly.

%matplotlib inline

In [3]:
# Root folder of the project; the data file is located relative to it.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
path = 'C:/Users/asus/Documents/Career Foundry/Achievement 6'

In [4]:
# Build the full location of the CSV under the project folder, then load it into a dataframe.
data_file = os.path.join(path, '6.2 Images & Assets', 'Data', 'real_estate.csv')
df = pd.read_csv(data_file)

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/asus/Documents/Career Foundry/Achievement 6\\6.2 Images & Assets\\Data\\real_estate.csv'

### 2. Data cleaning

In [None]:
# List the column labels as loaded, before any renaming.
df.columns

In [None]:
# Preview the first 20 rows.
df.head(20)

In [None]:
# Dimensions of the dataframe as (rows, columns).
df.shape

In [None]:
# Rename the columns in the same way as in Exercise 6.2.
# The mapping is kept in its own object, one entry per line, so each old -> new pair is easy to read.

new_column_names = {
    'X1 transaction date': 'Date',
    'X2 house age': 'House age',
    'X3 distance to the nearest MRT station': 'D. metro',
    'X4 number of convenience stores': 'N. shops',
    'Y house price of unit area': 'Price per unit',
    'X5 latitude': 'Latitude',
    'X6 longitude': 'Longitude',
}
df.rename(columns=new_column_names, inplace=True)

##### Missing values

In [None]:
# Count the missing entries in each column.

df.isna().sum()

# No missing values to handle

##### Duplicates check

In [None]:
# Boolean Series with one entry per row: True where the row repeats an earlier row in full.
dups = df.duplicated()

In [None]:
# Count the rows flagged as duplicates. Summing the boolean Series gives the number of True values;
# its shape would only report the total number of rows, whether or not any are duplicated.
dups.sum() # 0 means no dups

##### Extreme values checks

In [None]:
# Histogram of the price with a KDE curve on a density scale, used to spot extreme values.
# histplot is the replacement for the deprecated sns.distplot; stat='density' and cut=3 keep the
# same kind of picture distplot produced.
sns.histplot(df['Price per unit'], bins=25, kde=True, stat='density', kde_kws={'cut': 3}) # Shows extreme values

# Some of the outliers in the high values may need to be removed, but let's try without doing so first.

In [None]:
# Mean of the price column, to compare against the median and the max in the next cells.
df['Price per unit'].mean() # Check the mean

In [None]:
# Median of the price column; unlike the mean, it is not pulled by a few extreme values.
df['Price per unit'].median() # Check the median

In [None]:
# Largest price in the column, i.e. the most extreme high value.
df['Price per unit'].max() # Check the max

### 3. Data prep for regression analysis 

In [None]:
# Scatterplot of the two chosen variables, drawn through the dataframe's matplotlib-backed plot method.
# The Axes returned by df.plot is labelled directly.

scatter_ax = df.plot(x='D. metro', y='Price per unit', style='o')  # style='o' draws markers only; without it the points are joined by lines.
scatter_ax.set_title('Distance to metro vs Price per unit')
scatter_ax.set_xlabel('D. metro')
scatter_ax.set_ylabel('Price per unit')
plt.show()

In [None]:
# Turn the predictor and the target into 2-D NumPy arrays with a single column each
# (reshape(-1, 1) gives one row per observation).

X = df['D. metro'].to_numpy().reshape(-1, 1)
y = df['Price per unit'].to_numpy().reshape(-1, 1)

In [None]:
# Inspect the predictor array (distance to metro).
X

In [None]:
# Inspect the target array (price per unit).
y

In [None]:
# Split data into a training set and a test set.
# test_size=0.3 holds out 30% of the rows for testing; random_state=0 fixes the shuffle so the split is reproducible.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

### 4. Regression analysis

In [None]:
# Create a regression object (an unfitted linear regression estimator).

regression = LinearRegression()  # This is the regression object, which will be fit onto the training set.

In [None]:
# Fit the regression object onto the training set only; the test set stays unseen for evaluation.

regression.fit(X_train, y_train)

In [None]:
# Predict the values of y for the held-out test predictors (X_test).

y_predicted = regression.predict(X_test)

In [None]:
# Create a plot that shows the regression line from the model on the test set.
# An explicit Figure/Axes pair is used rather than binding the pyplot module to another name,
# so it is clear which object is being drawn on.

fig, ax = plt.subplots()
ax.scatter(X_test, y_test, color='gray', s=15)  # Actual test observations
ax.plot(X_test, y_predicted, color='red', linewidth=3)  # Model predictions for the same X values
ax.set_title('Distance from metro vs Price per unit (Test set)')
ax.set_xlabel('Distance from metro')
ax.set_ylabel('Price per unit')
plt.show()

In [None]:
# Create objects that contain the model summary statistics.

rmse = mean_squared_error(y_test, y_predicted) # This is the mean squared error
r2 = r2_score(y_test, y_predicted) # This is the R2 score. 

In [None]:
# Print the model summary statistics. This is where you evaluate the performance of the model.

print('Slope:' ,regression.coef_)
print('Mean squared error: ', rmse)
print('R2 score: ', r2)

In [None]:
# Inspect the raw predictions for the test set.
y_predicted

In [None]:
# Put the actual and the predicted test-set values of y side by side in one dataframe.
# Both arrays are (n, 1), so they are flattened to 1-D before becoming columns.

comparison_columns = {
    'Actual': y_test.ravel(),
    'Predicted': y_predicted.ravel(),
}
data = pd.DataFrame(comparison_columns)
data.head(30)

##### Compare how the regression fits the training set

In [None]:
# Predict on the data the model was fitted on, to compare training performance with test performance.

y_predicted_train = regression.predict(X_train) # This is predicting X_train!

In [None]:
rmse = mean_squared_error(y_train, y_predicted_train)
r2 = r2_score(y_train, y_predicted_train)

In [None]:
print('Slope:' ,regression.coef_)
print('Mean squared error: ', rmse)
print('R2 score: ', r2)

In [None]:
# Visualizing the training set results.
# An explicit Figure/Axes pair is used rather than binding the pyplot module to a name,
# so it is clear which object is being drawn on.

fig, ax = plt.subplots()
ax.scatter(X_train, y_train, color='green', s=15)  # Actual training observations
ax.plot(X_train, y_predicted_train, color='red', linewidth=3)  # Model predictions for the same X values
ax.set_title('Distance from metro vs Price per unit (Train set)')
ax.set_xlabel('Distance from metro')
ax.set_ylabel('Price per unit')
plt.show()

### Bonus: performance improvement after removing outliers

In [None]:
# Drop the extreme "Price per unit" values spotted during the consistency checks:
# only rows with a price of at most 100 are kept.

df_test = df.loc[df['Price per unit'].le(100)]

In [None]:
# Same scatterplot as before, now on the filtered data, to see the effect of removing the outliers.

filtered_ax = df_test.plot(x='D. metro', y='Price per unit', style='o')
filtered_ax.set_title('Distance to metro vs Price per unit')
filtered_ax.set_xlabel('D. metro')
filtered_ax.set_ylabel('Price per unit')
plt.show()

In [None]:
# Rebuild the single-column predictor and target arrays, this time from the filtered dataframe.

X_2 = df_test['D. metro'].to_numpy().reshape(-1, 1)
y_2 = df_test['Price per unit'].to_numpy().reshape(-1, 1)

In [None]:
# Split the filtered data with the same settings as the first split (30% test, random_state=0).
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, y_2, test_size=0.3, random_state=0)

In [None]:
# Run and fit the regression.
# NOTE: this rebinds `regression`, so the model fitted on the unfiltered data is no longer reachable under that name.

regression = LinearRegression()  
regression.fit(X_train_2, y_train_2)

In [None]:
# Predict on the test split of the filtered data, using the refitted model.

y_predicted_2 = regression.predict(X_test_2)

In [None]:
rmse = mean_squared_error(y_test_2, y_predicted_2)
r2 = r2_score(y_test_2, y_predicted_2)

In [None]:
print('Slope:' ,regression.coef_)
print('Mean squared error: ', rmse)
print('R2 score: ', r2)

In [None]:
# Visualizing the test set results.
# An explicit Figure/Axes pair is used rather than binding the pyplot module to a name,
# so it is clear which object is being drawn on.
fig, ax = plt.subplots()
ax.scatter(X_test_2, y_test_2, color='gray', s=15)  # Actual test observations (outliers removed)
ax.plot(X_test_2, y_predicted_2, color='red', linewidth=3)  # Model predictions for the same X values
ax.set_title('Distance from metro vs Price per unit (Test set)')
ax.set_xlabel('Distance from metro')
ax.set_ylabel('Price per unit')
plt.show()

In [None]:
# Actual vs predicted values of y for the test split of the filtered data, side by side.
# Both arrays are (n, 1), so they are flattened to 1-D before becoming columns.
comparison_columns = {
    'Actual': y_test_2.ravel(),
    'Predicted': y_predicted_2.ravel(),
}
data = pd.DataFrame(comparison_columns)
data.head(30)