In [1]:
import pandas as pd
import warnings
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler  
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.manifold import TSNE
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
import mglearn

### Question 1
Load the telco data from Lab 3, and perform a train-test split. Now scale the data, using StandardScaler. To do this, you fit the scaler to the train data and use that to transform both the train and test data. Now build the MLP model, and print the results. How do they compare with the MLP results without scaling (from Lab 3)?

In [2]:
df = pd.read_csv("telco.csv")
X = df.drop(['Churn'], axis=1)
y = df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [3]:
mlp = MLPClassifier(random_state=8)
mlp.fit(X_train_scaled, y_train)
print("Accuracy on training set: {:.3f}".format(mlp.score(X_train_scaled, y_train)))
print("Accuracy on test set: {:.3f}".format(mlp.score(X_test_scaled, y_test)))
print('Test accuracy without using scaler: 0.783 (from last session)')

Accuracy on training set: 0.867
Accuracy on test set: 0.780
Test accuracy without using scaler: 0.783 (from last session)




### Question 2
Read the HomesSoldHellerup.csv file, using the pandas read_csv method (note that the separator is a semicolon and not a comma, which is the default). How many different values are there for the Type feature? What about Type of Sale?

In [4]:
df = pd.read_csv('HomesSoldHellerup.csv', sep = ';')
print(df.value_counts('Type'))
print(df.value_counts('Type of Sale'))

Type
Lejlighed          1237
Villa               742
Rækkehus            174
Stuehus               3
Døgninstitution       2
Erhverv               2
dtype: int64
Type of Sale
Alm. Salg    1978
Fam. Salg     109
Andet          52
Auktion        21
dtype: int64


### Question 3
Assign to X a list of the features, m2 and Build Year. The target value y is Price. Perform a test-train split, and then create a Linear Regression model. What results do you get? 

In [5]:
X = df[['m2', 'Build Year']]
y = df['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y)
lr = LinearRegression()
lr.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(lr.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(lr.score(X_test, y_test)))

Accuracy on training set: 0.346
Accuracy on test set: 0.259


### Question 4

Try a Random Forest regressor on this data, with 30 estimators. What are the results here?

In [6]:
rfr = RandomForestRegressor(n_estimators = 30)
rfr.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(rfr.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(rfr.score(X_test, y_test)))

Accuracy on training set: 0.930
Accuracy on test set: 0.618


### Question 5
Assign to X the features m2, Build Year and Type. Create dummy values for X, using the get_dummies method. How many features are there now?

In [7]:
X = df[['m2', 'Build Year', 'Type']]
X_d = pd.get_dummies(X)
X_d.shape

(2160, 8)

### Question 6
Perform test-train split again using the dummy features, and then do another Linear Regression. What results do you get? 

In [8]:
y_new = df['Price'] #to avoid splitting y again
X_train, X_test, y_train, y_test = train_test_split(X_d, y_new)
lr = LinearRegression()
lr.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(lr.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(lr.score(X_test, y_test)))

Accuracy on training set: 0.319
Accuracy on test set: 0.366


### Question 7
Try Random Forest regression using the dummy features, and report those results.

In [9]:
rfr = RandomForestRegressor(n_estimators = 30)
rfr.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(rfr.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(rfr.score(X_test, y_test)))

Accuracy on training set: 0.936
Accuracy on test set: 0.639


### Question 8 (optional)
Try adding some additional features from the original HomesSoldHellerup data, converting them to dummies if necessary. What other features have predictive value? Find the best subset of features you can, and try various settings of hyperparameters. What is the best result you can get? (You can test this with Linear Regression or Random Forest)