In [1]:
# display inline plots
%matplotlib inline

# import libraries for numerical and scientific computing
import numpy as np

# import matplotlib for plotting
import matplotlib.pyplot as plt

# import pandas for data wrangling and munging
import pandas as pd

# pandas display options: wider text output, show up to 100 columns,
# and render DataFrames as HTML tables in the notebook
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)

# import plotting library built on top of matplotlib
import seaborn as sns

# plot styling: white background with grid lines, and the "poster" context,
# which scales fonts and line widths up
sns.set_style("whitegrid")
sns.set_context("poster")

# NOTE: the 'ignore' filter silences every warning for the rest of the session,
# including deprecation warnings raised by the libraries used below
import warnings
warnings.filterwarnings('ignore')



In [2]:
from utils import load_data_set, lowercase_column_names

In [3]:
# Load the two data sets, then pass them through lowercase_column_names
# (helper from utils; judging by its name it lower-cases the column labels).
loaded_train, loaded_test = load_data_set()
train, test = lowercase_column_names(loaded_train, loaded_test)

Loading datasets
Set ID as index


## Shape of the datasets

In [None]:
print 'Training data set {} and Test data set {}'.format(train.shape, test.shape)

## Pipeline

In [None]:
from features import FeatureTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [None]:
# Building blocks for the modelling pipeline.
# NOTE(review): FeatureTransformer is given both the train and the test frame;
# what it does with the test frame is defined in features.py — confirm that no
# test information leaks into fitting.
ft = FeatureTransformer(train, test)
# Standardises the transformed features (zero mean, unit variance by default).
scaler = StandardScaler()
# Ordinary least squares linear regression.
lm = LinearRegression()

In [None]:
# Chain the steps in order: feature transformation -> scaling -> linear model.
pipeline_steps = [
    ('transformer', ft),
    ('scaler', scaler),
    ('lm', lm),
]
pipeline = Pipeline(pipeline_steps)

In [None]:
# Split the training frame into predictors (every column except 'salary')
# and the target column.
feature_columns = train.columns.drop('salary')
X = train[feature_columns]
y = train['salary']

In [None]:
# Fit the whole pipeline (transformer -> scaler -> linear model) on all training rows.
pipeline.fit(X, y)

In [None]:
# Pull the linear model step out of the pipeline by its step name.
lm_comp = pipeline.named_steps['lm']

In [None]:
print 'Estimated intercept coefficient:', lm_comp.intercept_

In [None]:
print 'Number of coefficients:', len(lm_comp.coef_)

In [None]:
# Ask the transformer step for its feature names (presumably the names of the
# columns it outputs — see FeatureTransformer.get_feature_names in features.py).
transformer_step = pipeline.named_steps['transformer']
feature_names = transformer_step.get_feature_names()

In [None]:
# Pair each feature name with its estimated coefficient and show them as a table.
coefficient_rows = list(zip(feature_names, lm_comp.coef_))
pd.DataFrame(coefficient_rows, columns=['features', 'estimatedCoefficients'])

In [None]:
# Let's take a look at the first five predicted values.
fitted_values = pipeline.predict(X)
fitted_values[:5]

In [None]:
# Distribution of the fitted salaries over the training rows.
fig, ax = plt.subplots()
ax.hist(pipeline.predict(X))
ax.set_title('Predicted Salary (fitted values): $\hat{Y}_i$')
ax.set_xlabel('Salary')
ax.set_ylabel('Frequency');

In [None]:
# Observed vs fitted salaries for the training rows, using the fitted pipeline.
# Points close to the diagonal are well predicted.
plt.scatter(train.salary, pipeline.predict(X), alpha=0.3)
plt.xlabel("Salaries: $Y_i$")
plt.ylabel("Predicted salaries: $\hat{Y}_i$")
plt.title("Salaries vs Predicted salaries: $Y_i$ vs $\hat{Y}_i$");

In [None]:
print np.sum((train.salary - clf.predict(X)) ** 2)

In [None]:
mseFull = np.mean((train.salary - clf.predict(X)) ** 2)
print mseFull

In [None]:
# Pairwise correlations between the columns of X. X here is the frame selected
# directly from `train`, not the output of the pipeline's transformer step.
X.corr()

In [None]:
# Split into training and validation examples: 80% of the rows go to the
# training split, and random_state makes the split reproducible.
# train_test_split lives in sklearn.model_selection from scikit-learn 0.18 on
# (sklearn.cross_validation was removed in 0.20); fall back for older installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=44)

In [None]:
# lets print out the shape of the dataset
print X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Refit the pipeline on the training split only, then predict both splits so
# in-sample and held-out errors can be compared.
pipeline.fit(X_train, y_train)
pred_train = pipeline.predict(X_train)
pred_test = pipeline.predict(X_test)

In [None]:
print "Fit a model X_train, and calculate MSE with Y_train:", np.mean((y_train - pred_train) ** 2)
print "Fit a model X_train, and calculate MSE with X_test, Y_test:", np.mean((y_test - pred_test) ** 2)

In [None]:
# Residuals (prediction minus actual) against the predicted value, for the
# training split (blue) and the held-out split (green).
plt.scatter(pred_train, pred_train - y_train, c='b', s=40, alpha=0.5)
plt.scatter(pred_test, pred_test - y_test, c='g', s=40, alpha=0.5)
# Zero-residual reference line; axhline spans the full width of the axes, so it
# fits whatever range the predictions fall in.
plt.axhline(y=0, color='k')
plt.title('Residual Plot using training (blue) and test (green) data')
plt.xlabel('Predicted salary')
plt.ylabel('Residuals');

In [None]:
# k-fold cross validation (5 folds) of the pipeline on the training split.
# scikit-learn >= 0.18 exposes these tools in sklearn.model_selection, where
# KFold takes n_splits and the scorer is named 'neg_mean_squared_error'.
# Older releases only have sklearn.cross_validation with the legacy names.
try:
    from sklearn.model_selection import KFold, cross_val_score
    kf = KFold(n_splits=5)
    mse_scoring = 'neg_mean_squared_error'
except ImportError:
    from sklearn.cross_validation import KFold, cross_val_score
    kf = KFold(X_train.shape[0], n_folds=5)
    mse_scoring = 'mean_squared_error'

# Either scorer reports the negated MSE per fold (values closer to zero are better).
scores = cross_val_score(pipeline, X_train, y_train, scoring=mse_scoring, cv=kf)

In [None]:
print scores.min(), scores.mean(), scores.max()

In [None]:
# Take a logarithm of the target variable and rerun the whole analysis.
# Predictors are every column except 'salary'; the target becomes
# log1p(salary) = log(1 + salary), which np.expm1 inverts later on.
X = train[train.columns.drop('salary')]
y = np.log1p(train.salary)

In [None]:
# Re-split with the same train_size and random_state as the earlier split,
# this time with the log-transformed target.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=44)

In [None]:
# Fit the pipeline on the log-scale target, then map the predictions back to
# the original salary scale with expm1 (the inverse of log1p).
pipeline.fit(X_train, y_train)
pred_train = np.expm1(pipeline.predict(X_train))
pred_test = np.expm1(pipeline.predict(X_test))

In [None]:
# Inspect the training-split predictions (already mapped back through expm1).
pred_train

In [None]:
print "Fit a model X_train, and calculate MSE with Y_train:", np.mean((np.expm1(y_train) - pred_train) ** 2)
print "Fit a model X_train, and calculate MSE with X_test, Y_test:", np.mean((np.expm1(y_test) - pred_test) ** 2)

In [None]:
# Residuals (prediction minus actual, both on the original salary scale)
# against the predicted value, for the training split (blue) and the
# held-out split (green).
plt.scatter(pred_train, pred_train - np.expm1(y_train), c='b', s=40, alpha=0.5)
plt.scatter(pred_test, pred_test - np.expm1(y_test), c='g', s=40, alpha=0.5)
# Zero-residual reference line; axhline spans the full width of the axes, so it
# fits whatever range the predictions fall in.
plt.axhline(y=0, color='k')
plt.title('Residual Plot using training (blue) and test (green) data')
plt.xlabel('Predicted salary')
plt.ylabel('Residuals');

**Not much improvement in mean squared error after taking the log transformation.**