Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
nithyadurai87 committed Nov 10, 2018
0 parents commit 66d1014
Show file tree
Hide file tree
Showing 24 changed files with 3,469 additions and 0 deletions.
13 changes: 13 additions & 0 deletions 01_stats_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import matplotlib.pyplot as plt

# Pizza diameters (inches) and their observed prices (dollars), kept as
# column vectors to match scikit-learn's expected 2-D sample shape.
diameters = [[6], [8], [10], [14], [18], [21]]
prices = [[7], [9], [13], [17.5], [18], [21]]

# Scatter plot of the raw observations on a fixed 0-25 axis.
plt.figure()
plt.title('Pizza Price statistics')
plt.xlabel('Diameter')
plt.ylabel('dollar price')
plt.plot(diameters, prices, '.')
plt.axis([0, 25, 0, 25])
plt.grid(True)
plt.show()
20 changes: 20 additions & 0 deletions 02_simple_linear_regression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Training data: pizza diameters (inches) vs. prices (dollars),
# as column vectors for scikit-learn.
diameters = [[6], [8], [10], [14], [18]]
prices = [[7], [9], [13], [17.5], [18]]

# Fit a one-feature linear regression on the five samples.
model = LinearRegression()
model.fit(diameters, prices)

# Plot the observations together with the fitted regression line.
plt.figure()
plt.title('Pizza Price Predictions')
plt.xlabel('Diameter')
plt.ylabel('dollar price')
plt.plot(diameters, prices, '.')
plt.plot(diameters, model.predict(diameters), '--')
plt.axis([0, 25, 0, 25])
plt.grid(True)
plt.show()

# Predict the price of an unseen 21-inch pizza.
print (model.predict([[21]]))
14 changes: 14 additions & 0 deletions 03_loss_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from sklearn.linear_model import LinearRegression
import numpy as np
# NOTE(review): the next two imports are unused in this script.
from numpy.linalg import inv,lstsq
from numpy import dot, transpose

# Pizza diameters (inches) and prices (dollars) as column vectors.
diameters = [[6], [8], [10], [14], [18]]
prices = [[7], [9], [13], [17.5], [18]]

model = LinearRegression()
model.fit(diameters, prices)

# Mean squared residual of the fitted line on the training data.
print ('Residual sum of squares = ', np.mean((model.predict(diameters) - prices) ** 2))
# Sample variance (ddof=1) of the diameters.
print ('Variance = ', np.var([6, 8, 10, 14, 18], ddof=1))
# Off-diagonal entry of the 2x2 covariance matrix: cov(diameter, price).
print ('Co-variance = ', np.cov([6, 8, 10, 14, 18], [7, 9, 13, 17.5, 18])[0][1])
14 changes: 14 additions & 0 deletions 04_scoring.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from sklearn.linear_model import LinearRegression
import numpy as np
# NOTE(review): the next two imports are unused in this script.
from numpy.linalg import inv,lstsq
from numpy import dot, transpose

# Training set: pizza diameters (inches) and prices (dollars).
train_x = [[6], [8], [10], [14], [18]]
train_y = [[7], [9], [13], [17.5], [18]]
model = LinearRegression()
model.fit(train_x, train_y)

# Held-out samples used only for scoring the fitted model.
x_test = [[8], [9], [11], [16], [12]]
y_test = [[11], [8.5], [15], [18], [11]]

# Coefficient of determination (R^2) on the test set.
print ('R-squared score = ', model.score(x_test, y_test))
21 changes: 21 additions & 0 deletions 05_multiple_linear_regression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from sklearn.linear_model import LinearRegression
from numpy.linalg import lstsq

# Two features per sample: [diameter, number of toppings]; target is price.
train_x = [[6, 2], [8, 1], [10, 0], [14, 2], [18, 0]]
train_y = [[7], [9], [13], [17.5], [18]]
model = LinearRegression()
model.fit(train_x, train_y)

# Test set with the same two features.
test_x = [[8, 2], [9, 0], [11, 2], [16, 2], [12, 0]]
test_y = [[11], [8.5], [15], [18], [11]]

# Show each model prediction alongside the observed value.
for idx, predicted in enumerate(model.predict(test_x)):
    print ((predicted, test_y[idx]))

# Least-squares coefficients computed directly (note: no intercept column,
# so these differ from the LinearRegression fit above).
print (lstsq(train_x, train_y, rcond=None)[0])

print ('R-squared score = ', model.score(test_x, test_y))



1,461 changes: 1,461 additions & 0 deletions 06_input_data.csv

Large diffs are not rendered by default.

1,461 changes: 1,461 additions & 0 deletions 06_output_data.csv

Large diffs are not rendered by default.

70 changes: 70 additions & 0 deletions 06_pandas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import pandas as pd

# Pre-processing pipeline for the housing data set: inspect the frame,
# drop sparse columns, impute remaining nulls, derive age features, keep
# only columns correlated with the sale price, and write the result out.
# data can be downloaded from the url: https://www.kaggle.com/vikrishnan/boston-house-prices
df = pd.read_csv('./06_input_data.csv')

# Understanding data
print (df.shape)
print (df.columns)
print(df.head(5))
print(df.info())
print(df.describe())
print(df.groupby('LotShape').size())

# Dropping null value columns which cross the threshold
# NOTE(review): the threshold is 5% of the number of *columns*
# (len(a)), not 5% of the number of rows — possibly intended to be
# 0.05 * len(df); left as-is to preserve behaviour. Confirm intent.
a = df.isnull().sum()
print (a)
b = a[a>(0.05*len(a))]
print (b)
df = df.drop(b.index, axis=1)
print (df.shape)

# Replacing null value columns (text) with most used value
a1 = df.select_dtypes(include=['object']).isnull().sum()
print (a1)
print (a1.index)
for i in a1.index:
    # value_counts() is sorted descending, so index 0 is the mode.
    b1 = df[i].value_counts().index.tolist()
    print (b1)
    df[i] = df[i].fillna(b1[0])

# Replacing null value columns (int, float) with most used value
a2 = df.select_dtypes(include=['integer','float']).isnull().sum()
print (a2)
b2 = a2[a2!=0].index
print (b2)
# Row 0 of mode() holds the most frequent value of every column.
df = df.fillna(df[b2].mode().to_dict(orient='records')[0])

# Creating new columns from existing columns
print (df.shape)
a3 = df['YrSold'] - df['YearBuilt']
b3 = df['YrSold'] - df['YearRemodAdd']
df['Years Before Sale'] = a3
df['Years Since Remod'] = b3
print (df.shape)

# Dropping unwanted columns
df = df.drop(["Id", "MoSold", "SaleCondition", "SaleType", "YearBuilt", "YearRemodAdd"], axis=1)
print (df.shape)

# Dropping columns which has correlation with target less than threshold
target='SalePrice'
x = df.select_dtypes(include=['integer','float']).corr()[target].abs()
print (x)
df=df.drop(x[x<0.4].index, axis=1)
print (df.shape)

# Checking for the necessary features after dropping some columns
l1 = ["PID","MS SubClass","MS Zoning","Street","Alley","Land Contour","Lot Config","Neighborhood","Condition 1","Condition 2","Bldg Type","House Style","Roof Style","Roof Matl","Exterior 1st","Exterior 2nd","Mas Vnr Type","Foundation","Heating","Central Air","Garage Type","Misc Feature","Sale Type","Sale Condition"]
l2 = []
for i in l1:
    if i in df.columns:
        l2.append(i)

# Getting rid of nominal columns with too many unique values
# BUG FIX: the original evaluated `len(df[i].unique())>10` as a bare,
# discarded expression and then dropped every column in l2
# unconditionally; the cardinality check is now an actual condition,
# matching the comment above.
for i in l2:
    if len(df[i].unique())>10:
        df=df.drop(i, axis=1)
print (df.columns)

df.to_csv('06_output_data.csv',index=False)
44 changes: 44 additions & 0 deletions 07_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split,cross_val_score
# NOTE(review): sklearn.externals.joblib was removed in scikit-learn
# >= 0.23; on modern versions replace with `import joblib`.
from sklearn.externals import joblib
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from math import sqrt
import os

# Train a linear-regression model on the cleaned housing data produced
# by 06_pandas.py, report its error metrics, plot diagnostics, and
# persist the model for the prediction/Flask scripts.
df = pd.read_csv('./06_output_data.csv')

# Move 'SalePrice' to the last column, then keep only numeric columns.
i = list(df.columns.values)
i.pop(i.index('SalePrice'))
df0 = df[i+['SalePrice']]
df = df0.select_dtypes(include=['integer','float'])
print (df.columns)

# Every numeric column except the last ('SalePrice') is a feature.
X = df[list(df.columns)[:-1]]
y = df['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y)
regressor = LinearRegression()
regressor.fit(X_train, y_train)

y_predictions = regressor.predict(X_test)

meanSquaredError=mean_squared_error(y_test, y_predictions)
rootMeanSquaredError = sqrt(meanSquaredError)

print("Number of predictions:",len(y_predictions))
print("Mean Squared Error:", meanSquaredError)
print("Root Mean Squared Error:", rootMeanSquaredError)
print ("Scoring:",regressor.score(X_test, y_test))

# Parity plot: predictions vs. actuals; the diagonal marks a perfect fit.
plt.plot(y_predictions,y_test,'r.')
plt.plot(y_predictions,y_predictions,'k-')
plt.title('Parity Plot - Linear Regression')
plt.show()

# Residual plot: residuals should scatter evenly around the zero line.
plot = plt.scatter(y_predictions, (y_predictions - y_test), c='b')
# GENERALIZED: span the zero line over the actual prediction range
# instead of the hard-coded 100000-400000 window, so the plot stays
# correct for any price scale.
plt.hlines(y=0, xmin=y_predictions.min(), xmax=y_predictions.max())
plt.title('Residual Plot - Linear Regression')
plt.show()

joblib.dump(regressor, './07_output_salepricemodel.pkl')
Binary file added 07_output_salepricemodel.pkl
Binary file not shown.
13 changes: 13 additions & 0 deletions 08_input.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"OverallQual":[7],
"TotalBsmtSF":[856],
"1stFlrSF":[856],
"GrLivArea":[1710],
"FullBath":[2],
"TotRmsAbvGrd":[8],
"Fireplaces":[0],
"GarageCars":[2],
"GarageArea":[548],
"Years Before Sale":[5],
"Years Since Remod":[5]
}
10 changes: 10 additions & 0 deletions 08_predict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import os
import json
import pandas as pd
import numpy
from sklearn.externals import joblib

# Load one sample from JSON, restore the persisted regression model,
# and print the predicted sale price.
# NOTE(review): the column order of the JSON-built frame must match the
# feature order the model was trained with — verify against 07_model.py.
sample = pd.read_json('./08_input.json')
model = joblib.load("./07_output_salepricemodel.pkl")
prediction = model.predict(sample)
print (str(prediction))
31 changes: 31 additions & 0 deletions 09_flask_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import os
import json
import pandas as pd
import numpy
from flask import Flask, render_template, request, jsonify
from pandas.io.json import json_normalize
from sklearn.externals import joblib

app = Flask(__name__)
# Listening port, configurable through the PORT environment variable.
port = int(os.getenv('PORT', 5500))

@app.route('/')
def home():
    """Serve the landing page."""
    return render_template('index.html')

@app.route('/api/salepricemodel', methods=['POST'])
def salepricemodel():
    """Predict a sale price from a POSTed JSON payload of feature columns."""
    if request.method == 'POST':
        try:
            post_data = request.get_json()
            json_data = json.dumps(post_data)
            s = pd.read_json(json_data)
            # NOTE(review): loading the model on every request is slow;
            # consider loading it once at start-up.
            p = joblib.load("./07_output_salepricemodel.pkl")
            r = p.predict(s)
            return str(r)

        except Exception as e:
            # BUG FIX: the original `return (e)` returned the Exception
            # object itself, which Flask cannot turn into a response
            # (a TypeError would mask the real error). Return its text.
            return str(e)

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=port, debug=True)
102 changes: 102 additions & 0 deletions 10_model_compare.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.externals import joblib
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from math import sqrt
import numpy as np
import os

# Load the cleaned housing data, move 'SalePrice' to the last column,
# keep only numeric columns, and build the train/test split shared by
# every model-comparison function below.
df = pd.read_csv('./06_output_data.csv')

cols = list(df.columns.values)
cols.pop(cols.index('SalePrice'))
df0 = df[cols+['SalePrice']]
df = df0.select_dtypes(include=['integer','float'])

# All numeric columns except the last ('SalePrice') are features.
X = df[list(df.columns)[:-1]]
y = df['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y)

def linear():
    """Fit plain linear regression; return (R^2 score, RMSE) on the test set."""
    model = LinearRegression()
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    rmse = sqrt(mean_squared_error(y_test, preds))
    return (model.score(X_test, y_test), rmse)

def ridge():
    """Fit Ridge (L2) regression; return (R^2 score, RMSE) on the test set."""
    # NOTE(review): normalize= was removed in scikit-learn >= 1.2; newer
    # versions need a StandardScaler pipeline instead.
    model = Ridge(alpha=.3, normalize=True)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    rmse = sqrt(mean_squared_error(y_test, preds))
    return (model.score(X_test, y_test), rmse)

def lasso():
    """Fit Lasso (L1) regression; return (R^2 score, RMSE) on the test set."""
    # NOTE(review): normalize= was removed in scikit-learn >= 1.2; newer
    # versions need a StandardScaler pipeline instead.
    model = Lasso(alpha=0.00009, normalize=True)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    rmse = sqrt(mean_squared_error(y_test, preds))
    return (model.score(X_test, y_test), rmse)

def elasticnet():
    """Fit ElasticNet (L1+L2) regression; return (R^2 score, RMSE) on the test set."""
    # NOTE(review): normalize= was removed in scikit-learn >= 1.2.
    model = ElasticNet(alpha=1, l1_ratio=0.5, normalize=False)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    rmse = sqrt(mean_squared_error(y_test, preds))
    return (model.score(X_test, y_test), rmse)

def randomforest():
    """Fit a random forest; print feature importances and return (R^2 score, RMSE)."""
    # NOTE(review): criterion='mse' was renamed 'squared_error' in
    # scikit-learn 1.0 and removed in 1.2.
    model = RandomForestRegressor(n_estimators=15,min_samples_split=15,criterion='mse',max_depth=None)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print("Selected Features for RamdomForest",model.feature_importances_)
    rmse = sqrt(mean_squared_error(y_test, preds))
    return (model.score(X_test, y_test), rmse)

def perceptron():
    """Fit an MLP regressor; print its weights and return (R^2 score, RMSE)."""
    # NOTE(review): a single hidden layer of 5000 units is very slow and
    # likely oversized for this data set — confirm it is intentional.
    model = MLPRegressor(hidden_layer_sizes=(5000,), activation='relu', solver='adam', max_iter=1000)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print("Co-efficients of Perceptron",model.coefs_)
    rmse = sqrt(mean_squared_error(y_test, preds))
    return (model.score(X_test, y_test), rmse)

def decisiontree():
    """Fit a decision tree; print feature importances and return (R^2 score, RMSE)."""
    model = DecisionTreeRegressor(min_samples_split=30,max_depth=None)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print("Selected Features for DecisionTrees",model.feature_importances_)
    rmse = sqrt(mean_squared_error(y_test, preds))
    return (model.score(X_test, y_test), rmse)

def adaboost():
    """Fit an AdaBoost regressor; print feature importances and return (R^2 score, RMSE).

    BUG FIX: the original fitted twice — once via the constructor chain
    `AdaBoostRegressor(...).fit(...)` and once via the explicit
    `regressor.fit(...)` — doing the whole boosting run redundantly.
    The model is now fitted a single time.
    """
    regressor = AdaBoostRegressor(random_state=8, loss='exponential')
    regressor.fit(X_train, y_train)
    y_predictions = regressor.predict(X_test)
    print("Selected Features for Adaboost",regressor.feature_importances_)
    return (regressor.score(X_test, y_test),sqrt(mean_squared_error(y_test, y_predictions)))

def extratrees():
    """Fit an ExtraTrees regressor; print feature importances and return (R^2 score, RMSE).

    BUG FIX: the original fitted twice (constructor chain plus explicit
    call); the ensemble is now trained a single time.
    """
    regressor = ExtraTreesRegressor(n_estimators=50)
    regressor.fit(X_train, y_train)
    y_predictions = regressor.predict(X_test)
    print("Selected Features for Extratrees",regressor.feature_importances_)
    return (regressor.score(X_test, y_test),sqrt(mean_squared_error(y_test, y_predictions)))

def gradientboosting():
    """Fit a gradient-boosting regressor; print feature importances and return (R^2 score, RMSE).

    BUG FIX: the original fitted twice (constructor chain plus explicit
    call); with 500 estimators that doubled a substantial training cost.
    The model is now fitted a single time.
    """
    # NOTE(review): loss='ls' was renamed 'squared_error' in
    # scikit-learn 1.0 and removed in 1.2.
    regressor = GradientBoostingRegressor(loss='ls',n_estimators=500, min_samples_split=15)
    regressor.fit(X_train, y_train)
    y_predictions = regressor.predict(X_test)
    print("Selected Features for Gradientboosting",regressor.feature_importances_)
    return (regressor.score(X_test, y_test),sqrt(mean_squared_error(y_test, y_predictions)))

# Run every model in turn and report its (R^2 score, RMSE) pair.
print ("Score, RMSE values")
for label, trainer in (
        ("Linear = ", linear),
        ("Ridge = ", ridge),
        ("Lasso = ", lasso),
        ("ElasticNet = ", elasticnet),
        ("RandomForest = ", randomforest),
        ("Perceptron = ", perceptron),
        ("DecisionTree = ", decisiontree),
        ("AdaBoost = ", adaboost),
        ("ExtraTrees = ", extratrees),
        ("GradientBoosting = ", gradientboosting)):
    print (label, trainer())

21 changes: 21 additions & 0 deletions 11_MV_DV_correlation.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z
G,1,-0.52,-0.5,-0.19,-0.11,-0.42,0.78,0.53,0.52,-0.16,-0.15,-0.08,-0.44,0.33,0.39,-0.46,-0.08,0.2,0.57,0.76
H,-0.52,1,0.99,0.57,0.85,0.98,-0.77,-0.3,-1,-0.41,0.69,0.72,0.78,-0.6,-0.68,-0.07,0.22,0.58,0.06,-0.64
I,-0.5,0.99,1,0.66,0.87,0.99,-0.74,-0.23,-0.99,-0.36,0.74,0.78,0.76,-0.61,-0.67,-0.03,0.29,0.61,0.09,-0.63
J,-0.19,0.57,0.66,1,0.71,0.68,-0.23,0.46,-0.57,0.01,0.88,0.87,0.38,-0.48,-0.38,0.12,0.77,0.71,0.45,-0.33
K,-0.11,0.85,0.87,0.71,1,0.92,-0.34,0.13,-0.85,-0.52,0.8,0.89,0.62,-0.48,-0.54,-0.35,0.33,0.92,0.52,-0.22
L,-0.42,0.98,0.99,0.68,0.92,1,-0.67,-0.12,-0.98,-0.42,0.8,0.83,0.75,-0.62,-0.69,-0.12,0.36,0.71,0.24,-0.56
M,0.78,-0.77,-0.74,-0.23,-0.34,-0.67,1,0.62,0.77,0.18,-0.33,-0.27,-0.6,0.59,0.61,-0.23,-0.07,0.02,0.44,0.91
N,0.53,-0.3,-0.23,0.46,0.13,-0.12,0.62,1,0.3,0.15,0.45,0.42,-0.32,0.02,0.14,-0.11,0.71,0.48,0.82,0.45
O,0.52,-1,-0.99,-0.57,-0.85,-0.98,0.77,0.3,1,0.41,-0.69,-0.72,-0.78,0.6,0.68,0.07,-0.22,-0.58,-0.06,0.64
P,-0.16,-0.41,-0.36,0.01,-0.52,-0.42,0.18,0.15,0.41,1,-0.13,-0.29,-0.03,0.34,0.5,0.9,0.26,-0.49,-0.37,0.04
Q,-0.15,0.69,0.74,0.88,0.8,0.8,-0.33,0.45,-0.69,-0.13,1,0.96,0.51,-0.55,-0.53,0.01,0.8,0.79,0.57,-0.37
R,-0.08,0.72,0.78,0.87,0.89,0.83,-0.27,0.42,-0.72,-0.29,0.96,1,0.46,-0.57,-0.56,-0.17,0.68,0.89,0.63,-0.27
S,-0.44,0.78,0.76,0.38,0.62,0.75,-0.6,-0.32,-0.78,-0.03,0.51,0.46,1,-0.1,-0.21,0.26,0.15,0.36,-0.05,-0.4
T,0.33,-0.6,-0.61,-0.48,-0.48,-0.62,0.59,0.02,0.6,0.34,-0.55,-0.57,-0.1,1,0.86,0.1,-0.38,-0.35,-0.11,0.66
U,0.39,-0.68,-0.67,-0.38,-0.54,-0.69,0.61,0.14,0.68,0.5,-0.53,-0.56,-0.21,0.86,1,0.23,-0.24,-0.39,-0.14,0.67
V,-0.46,-0.07,-0.03,0.12,-0.35,-0.12,-0.23,-0.11,0.07,0.9,0.01,-0.17,0.26,0.1,0.23,1,0.29,-0.48,-0.54,-0.35
W,-0.08,0.22,0.29,0.77,0.33,0.36,-0.07,0.71,-0.22,0.26,0.8,0.68,0.15,-0.38,-0.24,0.29,1,0.44,0.51,-0.24
X,0.2,0.58,0.61,0.71,0.92,0.71,0.02,0.48,-0.58,-0.49,0.79,0.89,0.36,-0.35,-0.39,-0.48,0.44,1,0.8,0.09
Y,0.57,0.06,0.09,0.45,0.52,0.24,0.44,0.82,-0.06,-0.37,0.57,0.63,-0.05,-0.11,-0.14,-0.54,0.51,0.8,1,0.42
Z,0.76,-0.64,-0.63,-0.33,-0.22,-0.56,0.91,0.45,0.64,0.04,-0.37,-0.27,-0.4,0.66,0.67,-0.35,-0.24,0.09,0.42,1
Binary file added 11_MV_DV_correlation.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 66d1014

Please sign in to comment.