Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
nithyadurai87 committed Nov 10, 2018
0 parents commit 66d1014
Show file tree
Hide file tree
Showing 24 changed files with 3,469 additions and 0 deletions.
13 changes: 13 additions & 0 deletions 01_stats_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import matplotlib.pyplot as plt

# Pizza diameters (inches) and their observed prices (dollars), kept as
# column vectors to match scikit-learn's expected 2-D sample shape.
diameters = [[6], [8], [10], [14], [18], [21]]
prices = [[7], [9], [13], [17.5], [18], [21]]

# Scatter plot of the raw observations on a fixed 0-25 axis.
plt.figure()
plt.title('Pizza Price statistics')
plt.xlabel('Diameter')
plt.ylabel('dollar price')
plt.plot(diameters, prices, '.')
plt.axis([0, 25, 0, 25])
plt.grid(True)
plt.show()
20 changes: 20 additions & 0 deletions 02_simple_linear_regression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Training data: pizza diameters (inches) vs. prices (dollars),
# as column vectors for scikit-learn.
diameters = [[6], [8], [10], [14], [18]]
prices = [[7], [9], [13], [17.5], [18]]

# Fit a one-feature linear regression on the five samples.
model = LinearRegression()
model.fit(diameters, prices)

# Plot the observations together with the fitted regression line.
plt.figure()
plt.title('Pizza Price Predictions')
plt.xlabel('Diameter')
plt.ylabel('dollar price')
plt.plot(diameters, prices, '.')
plt.plot(diameters, model.predict(diameters), '--')
plt.axis([0, 25, 0, 25])
plt.grid(True)
plt.show()

# Predict the price of an unseen 21-inch pizza.
print (model.predict([[21]]))
14 changes: 14 additions & 0 deletions 03_loss_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from sklearn.linear_model import LinearRegression
import numpy as np
# NOTE(review): the next two imports are unused in this script.
from numpy.linalg import inv,lstsq
from numpy import dot, transpose

# Pizza diameters (inches) and prices (dollars) as column vectors.
diameters = [[6], [8], [10], [14], [18]]
prices = [[7], [9], [13], [17.5], [18]]

model = LinearRegression()
model.fit(diameters, prices)

# Mean squared residual of the fitted line on the training data.
print ('Residual sum of squares = ', np.mean((model.predict(diameters) - prices) ** 2))
# Sample variance (ddof=1) of the diameters.
print ('Variance = ', np.var([6, 8, 10, 14, 18], ddof=1))
# Off-diagonal entry of the 2x2 covariance matrix: cov(diameter, price).
print ('Co-variance = ', np.cov([6, 8, 10, 14, 18], [7, 9, 13, 17.5, 18])[0][1])
14 changes: 14 additions & 0 deletions 04_scoring.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from sklearn.linear_model import LinearRegression
import numpy as np
# NOTE(review): the next two imports are unused in this script.
from numpy.linalg import inv,lstsq
from numpy import dot, transpose

# Training set: pizza diameters (inches) and prices (dollars).
train_x = [[6], [8], [10], [14], [18]]
train_y = [[7], [9], [13], [17.5], [18]]
model = LinearRegression()
model.fit(train_x, train_y)

# Held-out samples used only for scoring the fitted model.
x_test = [[8], [9], [11], [16], [12]]
y_test = [[11], [8.5], [15], [18], [11]]

# Coefficient of determination (R^2) on the test set.
print ('R-squared score = ', model.score(x_test, y_test))
21 changes: 21 additions & 0 deletions 05_multiple_linear_regression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from sklearn.linear_model import LinearRegression
from numpy.linalg import lstsq

# Two features per sample: [diameter, number of toppings]; target is price.
train_x = [[6, 2], [8, 1], [10, 0], [14, 2], [18, 0]]
train_y = [[7], [9], [13], [17.5], [18]]
model = LinearRegression()
model.fit(train_x, train_y)

# Test set with the same two features.
test_x = [[8, 2], [9, 0], [11, 2], [16, 2], [12, 0]]
test_y = [[11], [8.5], [15], [18], [11]]

# Show each model prediction alongside the observed value.
for idx, predicted in enumerate(model.predict(test_x)):
    print ((predicted, test_y[idx]))

# Least-squares coefficients computed directly (note: no intercept column,
# so these differ from the LinearRegression fit above).
print (lstsq(train_x, train_y, rcond=None)[0])

print ('R-squared score = ', model.score(test_x, test_y))



1,461 changes: 1,461 additions & 0 deletions 06_input_data.csv

Large diffs are not rendered by default.

1,461 changes: 1,461 additions & 0 deletions 06_output_data.csv

Large diffs are not rendered by default.

70 changes: 70 additions & 0 deletions 06_pandas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import pandas as pd

# Pre-processing pipeline for the housing data set: inspect the frame,
# drop sparse columns, impute remaining nulls, derive age features, keep
# only columns correlated with the sale price, and write the result out.
# data can be downloaded from the url: https://www.kaggle.com/vikrishnan/boston-house-prices
df = pd.read_csv('./06_input_data.csv')

# Understanding data
print (df.shape)
print (df.columns)
print(df.head(5))
print(df.info())
print(df.describe())
print(df.groupby('LotShape').size())

# Dropping null value columns which cross the threshold
# NOTE(review): the threshold is 5% of the number of *columns*
# (len(a)), not 5% of the number of rows — possibly intended to be
# 0.05 * len(df); left as-is to preserve behaviour. Confirm intent.
a = df.isnull().sum()
print (a)
b = a[a>(0.05*len(a))]
print (b)
df = df.drop(b.index, axis=1)
print (df.shape)

# Replacing null value columns (text) with most used value
a1 = df.select_dtypes(include=['object']).isnull().sum()
print (a1)
print (a1.index)
for i in a1.index:
    # value_counts() is sorted descending, so index 0 is the mode.
    b1 = df[i].value_counts().index.tolist()
    print (b1)
    df[i] = df[i].fillna(b1[0])

# Replacing null value columns (int, float) with most used value
a2 = df.select_dtypes(include=['integer','float']).isnull().sum()
print (a2)
b2 = a2[a2!=0].index
print (b2)
# Row 0 of mode() holds the most frequent value of every column.
df = df.fillna(df[b2].mode().to_dict(orient='records')[0])

# Creating new columns from existing columns
print (df.shape)
a3 = df['YrSold'] - df['YearBuilt']
b3 = df['YrSold'] - df['YearRemodAdd']
df['Years Before Sale'] = a3
df['Years Since Remod'] = b3
print (df.shape)

# Dropping unwanted columns
df = df.drop(["Id", "MoSold", "SaleCondition", "SaleType", "YearBuilt", "YearRemodAdd"], axis=1)
print (df.shape)

# Dropping columns which has correlation with target less than threshold
target='SalePrice'
x = df.select_dtypes(include=['integer','float']).corr()[target].abs()
print (x)
df=df.drop(x[x<0.4].index, axis=1)
print (df.shape)

# Checking for the necessary features after dropping some columns
l1 = ["PID","MS SubClass","MS Zoning","Street","Alley","Land Contour","Lot Config","Neighborhood","Condition 1","Condition 2","Bldg Type","House Style","Roof Style","Roof Matl","Exterior 1st","Exterior 2nd","Mas Vnr Type","Foundation","Heating","Central Air","Garage Type","Misc Feature","Sale Type","Sale Condition"]
l2 = []
for i in l1:
    if i in df.columns:
        l2.append(i)

# Getting rid of nominal columns with too many unique values
# BUG FIX: the original evaluated `len(df[i].unique())>10` as a bare,
# discarded expression and then dropped every column in l2
# unconditionally; the cardinality check is now an actual condition,
# matching the comment above.
for i in l2:
    if len(df[i].unique())>10:
        df=df.drop(i, axis=1)
print (df.columns)

df.to_csv('06_output_data.csv',index=False)
44 changes: 44 additions & 0 deletions 07_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split,cross_val_score
# NOTE(review): sklearn.externals.joblib was removed in scikit-learn
# >= 0.23; on modern versions replace with `import joblib`.
from sklearn.externals import joblib
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from math import sqrt
import os

# Train a linear-regression model on the cleaned housing data produced
# by 06_pandas.py, report its error metrics, plot diagnostics, and
# persist the model for the prediction/Flask scripts.
df = pd.read_csv('./06_output_data.csv')

# Move 'SalePrice' to the last column, then keep only numeric columns.
i = list(df.columns.values)
i.pop(i.index('SalePrice'))
df0 = df[i+['SalePrice']]
df = df0.select_dtypes(include=['integer','float'])
print (df.columns)

# Every numeric column except the last ('SalePrice') is a feature.
X = df[list(df.columns)[:-1]]
y = df['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y)
regressor = LinearRegression()
regressor.fit(X_train, y_train)

y_predictions = regressor.predict(X_test)

meanSquaredError=mean_squared_error(y_test, y_predictions)
rootMeanSquaredError = sqrt(meanSquaredError)

print("Number of predictions:",len(y_predictions))
print("Mean Squared Error:", meanSquaredError)
print("Root Mean Squared Error:", rootMeanSquaredError)
print ("Scoring:",regressor.score(X_test, y_test))

# Parity plot: predictions vs. actuals; the diagonal marks a perfect fit.
plt.plot(y_predictions,y_test,'r.')
plt.plot(y_predictions,y_predictions,'k-')
plt.title('Parity Plot - Linear Regression')
plt.show()

# Residual plot: residuals should scatter evenly around the zero line.
plot = plt.scatter(y_predictions, (y_predictions - y_test), c='b')
# GENERALIZED: span the zero line over the actual prediction range
# instead of the hard-coded 100000-400000 window, so the plot stays
# correct for any price scale.
plt.hlines(y=0, xmin=y_predictions.min(), xmax=y_predictions.max())
plt.title('Residual Plot - Linear Regression')
plt.show()

joblib.dump(regressor, './07_output_salepricemodel.pkl')
Binary file added 07_output_salepricemodel.pkl
Binary file not shown.
13 changes: 13 additions & 0 deletions 08_input.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"OverallQual":[7],
"TotalBsmtSF":[856],
"1stFlrSF":[856],
"GrLivArea":[1710],
"FullBath":[2],
"TotRmsAbvGrd":[8],
"Fireplaces":[0],
"GarageCars":[2],
"GarageArea":[548],
"Years Before Sale":[5],
"Years Since Remod":[5]
}
10 changes: 10 additions & 0 deletions 08_predict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import os
import json
import pandas as pd
import numpy
from sklearn.externals import joblib

# Load one sample from JSON, restore the persisted regression model,
# and print the predicted sale price.
# NOTE(review): the column order of the JSON-built frame must match the
# feature order the model was trained with — verify against 07_model.py.
sample = pd.read_json('./08_input.json')
model = joblib.load("./07_output_salepricemodel.pkl")
prediction = model.predict(sample)
print (str(prediction))
31 changes: 31 additions & 0 deletions 09_flask_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import os
import json
import pandas as pd
import numpy
from flask import Flask, render_template, request, jsonify
from pandas.io.json import json_normalize
from sklearn.externals import joblib

app = Flask(__name__)
# Listening port, configurable through the PORT environment variable.
port = int(os.getenv('PORT', 5500))

@app.route('/')
def home():
    """Serve the landing page."""
    return render_template('index.html')

@app.route('/api/salepricemodel', methods=['POST'])
def salepricemodel():
    """Predict a sale price from a POSTed JSON payload of feature columns."""
    if request.method == 'POST':
        try:
            post_data = request.get_json()
            json_data = json.dumps(post_data)
            s = pd.read_json(json_data)
            # NOTE(review): loading the model on every request is slow;
            # consider loading it once at start-up.
            p = joblib.load("./07_output_salepricemodel.pkl")
            r = p.predict(s)
            return str(r)

        except Exception as e:
            # BUG FIX: the original `return (e)` returned the Exception
            # object itself, which Flask cannot turn into a response
            # (a TypeError would mask the real error). Return its text.
            return str(e)

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=port, debug=True)
102 changes: 102 additions & 0 deletions 10_model_compare.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.externals import joblib
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from math import sqrt
import numpy as np
import os

# Load the cleaned housing data, move 'SalePrice' to the last column,
# keep only numeric columns, and build the train/test split shared by
# every model-comparison function below.
df = pd.read_csv('./06_output_data.csv')

cols = list(df.columns.values)
cols.pop(cols.index('SalePrice'))
df0 = df[cols+['SalePrice']]
df = df0.select_dtypes(include=['integer','float'])

# All numeric columns except the last ('SalePrice') are features.
X = df[list(df.columns)[:-1]]
y = df['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y)

def linear():
    """Fit plain linear regression; return (R^2 score, RMSE) on the test set."""
    model = LinearRegression()
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    rmse = sqrt(mean_squared_error(y_test, preds))
    return (model.score(X_test, y_test), rmse)

def ridge():
    """Fit Ridge (L2) regression; return (R^2 score, RMSE) on the test set."""
    # NOTE(review): normalize= was removed in scikit-learn >= 1.2; newer
    # versions need a StandardScaler pipeline instead.
    model = Ridge(alpha=.3, normalize=True)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    rmse = sqrt(mean_squared_error(y_test, preds))
    return (model.score(X_test, y_test), rmse)

def lasso():
    """Fit Lasso (L1) regression; return (R^2 score, RMSE) on the test set."""
    # NOTE(review): normalize= was removed in scikit-learn >= 1.2; newer
    # versions need a StandardScaler pipeline instead.
    model = Lasso(alpha=0.00009, normalize=True)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    rmse = sqrt(mean_squared_error(y_test, preds))
    return (model.score(X_test, y_test), rmse)

def elasticnet():
    """Fit ElasticNet (L1+L2) regression; return (R^2 score, RMSE) on the test set."""
    # NOTE(review): normalize= was removed in scikit-learn >= 1.2.
    model = ElasticNet(alpha=1, l1_ratio=0.5, normalize=False)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    rmse = sqrt(mean_squared_error(y_test, preds))
    return (model.score(X_test, y_test), rmse)

def randomforest():
    """Fit a random forest; print feature importances and return (R^2 score, RMSE)."""
    # NOTE(review): criterion='mse' was renamed 'squared_error' in
    # scikit-learn 1.0 and removed in 1.2.
    model = RandomForestRegressor(n_estimators=15,min_samples_split=15,criterion='mse',max_depth=None)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print("Selected Features for RamdomForest",model.feature_importances_)
    rmse = sqrt(mean_squared_error(y_test, preds))
    return (model.score(X_test, y_test), rmse)

def perceptron():
    """Fit an MLP regressor; print its weights and return (R^2 score, RMSE)."""
    # NOTE(review): a single hidden layer of 5000 units is very slow and
    # likely oversized for this data set — confirm it is intentional.
    model = MLPRegressor(hidden_layer_sizes=(5000,), activation='relu', solver='adam', max_iter=1000)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print("Co-efficients of Perceptron",model.coefs_)
    rmse = sqrt(mean_squared_error(y_test, preds))
    return (model.score(X_test, y_test), rmse)

def decisiontree():
    """Fit a decision tree; print feature importances and return (R^2 score, RMSE)."""
    model = DecisionTreeRegressor(min_samples_split=30,max_depth=None)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print("Selected Features for DecisionTrees",model.feature_importances_)
    rmse = sqrt(mean_squared_error(y_test, preds))
    return (model.score(X_test, y_test), rmse)

def adaboost():
    """Fit an AdaBoost regressor; print feature importances and return (R^2 score, RMSE).

    BUG FIX: the original fitted twice — once via the constructor chain
    `AdaBoostRegressor(...).fit(...)` and once via the explicit
    `regressor.fit(...)` — doing the whole boosting run redundantly.
    The model is now fitted a single time.
    """
    regressor = AdaBoostRegressor(random_state=8, loss='exponential')
    regressor.fit(X_train, y_train)
    y_predictions = regressor.predict(X_test)
    print("Selected Features for Adaboost",regressor.feature_importances_)
    return (regressor.score(X_test, y_test),sqrt(mean_squared_error(y_test, y_predictions)))

def extratrees():
    """Fit an ExtraTrees regressor; print feature importances and return (R^2 score, RMSE).

    BUG FIX: the original fitted twice (constructor chain plus explicit
    call); the ensemble is now trained a single time.
    """
    regressor = ExtraTreesRegressor(n_estimators=50)
    regressor.fit(X_train, y_train)
    y_predictions = regressor.predict(X_test)
    print("Selected Features for Extratrees",regressor.feature_importances_)
    return (regressor.score(X_test, y_test),sqrt(mean_squared_error(y_test, y_predictions)))

def gradientboosting():
    """Fit a gradient-boosting regressor; print feature importances and return (R^2 score, RMSE).

    BUG FIX: the original fitted twice (constructor chain plus explicit
    call); with 500 estimators that doubled a substantial training cost.
    The model is now fitted a single time.
    """
    # NOTE(review): loss='ls' was renamed 'squared_error' in
    # scikit-learn 1.0 and removed in 1.2.
    regressor = GradientBoostingRegressor(loss='ls',n_estimators=500, min_samples_split=15)
    regressor.fit(X_train, y_train)
    y_predictions = regressor.predict(X_test)
    print("Selected Features for Gradientboosting",regressor.feature_importances_)
    return (regressor.score(X_test, y_test),sqrt(mean_squared_error(y_test, y_predictions)))

# Run every model in turn and report its (R^2 score, RMSE) pair.
print ("Score, RMSE values")
for label, trainer in (
        ("Linear = ", linear),
        ("Ridge = ", ridge),
        ("Lasso = ", lasso),
        ("ElasticNet = ", elasticnet),
        ("RandomForest = ", randomforest),
        ("Perceptron = ", perceptron),
        ("DecisionTree = ", decisiontree),
        ("AdaBoost = ", adaboost),
        ("ExtraTrees = ", extratrees),
        ("GradientBoosting = ", gradientboosting)):
    print (label, trainer())

21 changes: 21 additions & 0 deletions 11_MV_DV_correlation.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z
G,1,-0.52,-0.5,-0.19,-0.11,-0.42,0.78,0.53,0.52,-0.16,-0.15,-0.08,-0.44,0.33,0.39,-0.46,-0.08,0.2,0.57,0.76
H,-0.52,1,0.99,0.57,0.85,0.98,-0.77,-0.3,-1,-0.41,0.69,0.72,0.78,-0.6,-0.68,-0.07,0.22,0.58,0.06,-0.64
I,-0.5,0.99,1,0.66,0.87,0.99,-0.74,-0.23,-0.99,-0.36,0.74,0.78,0.76,-0.61,-0.67,-0.03,0.29,0.61,0.09,-0.63
J,-0.19,0.57,0.66,1,0.71,0.68,-0.23,0.46,-0.57,0.01,0.88,0.87,0.38,-0.48,-0.38,0.12,0.77,0.71,0.45,-0.33
K,-0.11,0.85,0.87,0.71,1,0.92,-0.34,0.13,-0.85,-0.52,0.8,0.89,0.62,-0.48,-0.54,-0.35,0.33,0.92,0.52,-0.22
L,-0.42,0.98,0.99,0.68,0.92,1,-0.67,-0.12,-0.98,-0.42,0.8,0.83,0.75,-0.62,-0.69,-0.12,0.36,0.71,0.24,-0.56
M,0.78,-0.77,-0.74,-0.23,-0.34,-0.67,1,0.62,0.77,0.18,-0.33,-0.27,-0.6,0.59,0.61,-0.23,-0.07,0.02,0.44,0.91
N,0.53,-0.3,-0.23,0.46,0.13,-0.12,0.62,1,0.3,0.15,0.45,0.42,-0.32,0.02,0.14,-0.11,0.71,0.48,0.82,0.45
O,0.52,-1,-0.99,-0.57,-0.85,-0.98,0.77,0.3,1,0.41,-0.69,-0.72,-0.78,0.6,0.68,0.07,-0.22,-0.58,-0.06,0.64
P,-0.16,-0.41,-0.36,0.01,-0.52,-0.42,0.18,0.15,0.41,1,-0.13,-0.29,-0.03,0.34,0.5,0.9,0.26,-0.49,-0.37,0.04
Q,-0.15,0.69,0.74,0.88,0.8,0.8,-0.33,0.45,-0.69,-0.13,1,0.96,0.51,-0.55,-0.53,0.01,0.8,0.79,0.57,-0.37
R,-0.08,0.72,0.78,0.87,0.89,0.83,-0.27,0.42,-0.72,-0.29,0.96,1,0.46,-0.57,-0.56,-0.17,0.68,0.89,0.63,-0.27
S,-0.44,0.78,0.76,0.38,0.62,0.75,-0.6,-0.32,-0.78,-0.03,0.51,0.46,1,-0.1,-0.21,0.26,0.15,0.36,-0.05,-0.4
T,0.33,-0.6,-0.61,-0.48,-0.48,-0.62,0.59,0.02,0.6,0.34,-0.55,-0.57,-0.1,1,0.86,0.1,-0.38,-0.35,-0.11,0.66
U,0.39,-0.68,-0.67,-0.38,-0.54,-0.69,0.61,0.14,0.68,0.5,-0.53,-0.56,-0.21,0.86,1,0.23,-0.24,-0.39,-0.14,0.67
V,-0.46,-0.07,-0.03,0.12,-0.35,-0.12,-0.23,-0.11,0.07,0.9,0.01,-0.17,0.26,0.1,0.23,1,0.29,-0.48,-0.54,-0.35
W,-0.08,0.22,0.29,0.77,0.33,0.36,-0.07,0.71,-0.22,0.26,0.8,0.68,0.15,-0.38,-0.24,0.29,1,0.44,0.51,-0.24
X,0.2,0.58,0.61,0.71,0.92,0.71,0.02,0.48,-0.58,-0.49,0.79,0.89,0.36,-0.35,-0.39,-0.48,0.44,1,0.8,0.09
Y,0.57,0.06,0.09,0.45,0.52,0.24,0.44,0.82,-0.06,-0.37,0.57,0.63,-0.05,-0.11,-0.14,-0.54,0.51,0.8,1,0.42
Z,0.76,-0.64,-0.63,-0.33,-0.22,-0.56,0.91,0.45,0.64,0.04,-0.37,-0.27,-0.4,0.66,0.67,-0.35,-0.24,0.09,0.42,1
Binary file added 11_MV_DV_correlation.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 66d1014

Please sign in to comment.