# Artificial Intelligence II: HW1 Tutorial

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

## Loading and Exploring the Dataset

In [None]:
#pandas parses a whole CSV file into a DataFrame with a single call
#The path is relative to the notebook's working directory
df = pd.read_csv(filepath_or_buffer="./sample_data/california_housing_train.csv")

In [None]:
!gdown --id 

In [None]:
#Preview the first five rows to see which columns exist and what their values look like
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [None]:
#Summary statistics (count, mean, std, min, quartiles, max) for every numeric column
summary_stats = df.describe()
summary_stats

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0
mean,-119.562108,35.625225,28.589353,2643.664412,539.410824,1429.573941,501.221941,3.883578,207300.912353
std,2.005166,2.13734,12.586937,2179.947071,421.499452,1147.852959,384.520841,1.908157,115983.764387
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.79,33.93,18.0,1462.0,297.0,790.0,282.0,2.566375,119400.0
50%,-118.49,34.25,29.0,2127.0,434.0,1167.0,409.0,3.5446,180400.0
75%,-118.0,37.72,37.0,3151.25,648.25,1721.0,605.25,4.767,265000.0
max,-114.31,41.95,52.0,37937.0,6445.0,35682.0,6082.0,15.0001,500001.0


## Data Pre-processing

In [None]:
#Count the missing values in each column
#Every count is zero for this dataset, so no imputation or row dropping is required
df.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
dtype: int64

In [None]:
#Split the table into model inputs (X) and the value we want to predict (Y)
target_column = 'median_house_value'
X = df.drop(columns=[target_column]) #Every column except the target
Y = df[target_column] #The target column alone

In [None]:
#Split train and test sets (80% of the rows for training, the remainder for testing)
#The rows are shuffled before splitting; fixing the seed makes the split -- and every
#metric computed from it -- identical each time the notebook is run
SEED = 42
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, train_size=0.8, random_state=SEED
)

## Train a model

In [None]:
#Fit a Ridge regression model, using the library's default hyperparameters, on the training split
clf = Ridge()
clf.fit(X=X_train, y=Y_train)

Ridge()

## Evaluate model's performance

In [None]:
#Predictions on the train set, kept as a reference point for the test results
Y_train_pred = clf.predict(X=X_train)
#Predictions of the trained model on data it has not seen during training
Y_test_pred = clf.predict(X=X_test)
print(Y_test_pred)

[ 98793.65499353  67248.16703627 156492.12478729 ... 233439.37714666
 301745.69755585 172554.74551526]


In [None]:
#Mean squared error of the predictions on each split (lower is better)
test_mse = mean_squared_error(y_true=Y_test, y_pred=Y_test_pred)
train_mse = mean_squared_error(y_true=Y_train, y_pred=Y_train_pred)

#Report the test error first, then the train error for comparison
print(f"Our classifier achieves a MSE of {test_mse:.2f} on the test set")
print(f"Our classifier achieves a MSE of {train_mse:.2f} on the train set")

Our classifier achieves a MSE of 4633771802.23 on the test set
Our classifier achieves a MSE of 4873337527.61 on the train set


## Notes and Conclusions
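As we can see, the MSE is quite large on both sets, so the model does not perform that well overall. We would normally expect the model to perform slightly better on the train set; in the run shown above the test MSE is actually a little lower, which can happen by chance because the split is random and the two errors are close.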

However, there are many things that we have not tried yet:

* Scaling our features
* Experimenting with different models 
* Using different hyperparameters for each model
* Testing which of the features are really helpful
* Creating additional synthetic features
* And many more...

Obviously, some of these steps might not apply to an NLP project, because the nature of textual data is very different and unique. However, the overall mindset is similar.