In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, date, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix
from pandas_profiling import ProfileReport
import graphviz
import csv

In [62]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.tree import export_graphviz
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, precision_recall_curve, r2_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import RocCurveDisplay
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
from sklearn.datasets import load_diabetes
from sklearn.datasets import load_breast_cancer

In [5]:
ins_df = pd.read_csv('insurance.csv')
ins_df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [8]:
X = pd.get_dummies(ins_df.iloc[:, :-1])
y = ins_df.loc[:, 'charges']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [12]:
pipe = make_pipeline(MinMaxScaler(), LinearRegression()).fit(X_train, y_train)
pipe.score(X_test, y_test)

0.7958786376014415

In [17]:
pipe.named_steps['linearregression'].coef_, pipe.named_steps['linearregression'].intercept_

(array([ 1.15069979e+04,  1.23886402e+04,  2.21106880e+03, -6.63153624e+00,
         6.63153624e+00, -1.18283103e+04,  1.18283103e+04,  5.35344576e+02,
         3.42211643e+02, -5.44457080e+02, -3.33099139e+02]),
 9582.347435907923)

In [27]:
lr_coef = pipe.named_steps['linearregression'].coef_
lr_intercept = pipe.named_steps['linearregression'].intercept_
(pd.DataFrame(lr_coef)
.rename(dict(enumerate(X.columns)))
.apply(lambda df: df.div(df.sum(axis=0)))
)

Unnamed: 0,0
age,0.440768
bmi,0.474539
children,0.084694
sex_female,-0.000254
sex_male,0.000254
smoker_no,-0.453076
smoker_yes,0.453076
region_northeast,0.020506
region_northwest,0.013108
region_southeast,-0.020855


In [40]:
pipe = make_pipeline(MinMaxScaler(), 
SGDRegressor(loss='squared_error', penalty='l1', alpha=0.1, learning_rate='invscaling', max_iter=1000, random_state=0)).fit(X_train, y_train)
pipe.score(X_test, y_test)

0.7947894067641075

In [41]:
pipe.named_steps['sgdregressor'].coef_, pipe.named_steps['sgdregressor'].intercept_, pipe.named_steps['sgdregressor'].n_iter_

(array([11593.79110741, 10382.67267152,  2205.8369832 ,  2278.09324513,
         2297.09892774, -9525.95351756, 14114.44893084,  1593.16722028,
         1395.66771951,   770.66960293,   802.38438972]),
 array([4588.49541328]),
 85)

In [42]:
lr_coef = pipe.named_steps['sgdregressor'].coef_
lr_intercept = pipe.named_steps['sgdregressor'].intercept_
(pd.DataFrame(lr_coef)
.rename(dict(enumerate(X.columns)))
.apply(lambda df: df.div(df.sum(axis=0)))
)

Unnamed: 0,0
age,0.305841
bmi,0.273892
children,0.058189
sex_female,0.060096
sex_male,0.060597
smoker_no,-0.251292
smoker_yes,0.372336
region_northeast,0.042027
region_northwest,0.036817
region_southeast,0.02033


In [44]:
pipe = make_pipeline(MinMaxScaler(), PolynomialFeatures(degree=3, interaction_only=True), LinearRegression()).fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.8342006759566594, 0.8664053037323034)

In [45]:
cer_df = pd.read_csv('cereal.csv')
cer_df.head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.5,93.704912
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843


In [50]:
pd.concat([cer_df.nunique(), cer_df.dtypes], axis=1).head()

Unnamed: 0,0,1
name,77,object
mfr,7,object
type,2,object
calories,11,int64
protein,6,int64


In [53]:
X = pd.get_dummies(cer_df.iloc[:, 1:-1])
y = cer_df.loc[:, 'rating']

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [74]:
pipe = make_pipeline(MinMaxScaler(), PolynomialFeatures(degree=3, interaction_only=True), LinearRegression()).fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(1.0, 0.9720882221344329)

In [75]:
pipe.named_steps['linearregression'].coef_, pipe.named_steps['linearregression'].intercept_

(array([ 2.93607069, -3.30030697,  0.63408809, ...,  0.        ,
         0.        ,  0.        ]),
 49.24990667433556)

In [58]:
lr_coef = pipe.named_steps['linearregression'].coef_
lr_intercept = pipe.named_steps['linearregression'].intercept_
(pd.DataFrame(lr_coef)
#.rename(dict(enumerate(X.columns)))
.apply(lambda df: df.div(df.sum(axis=0)))
)

Unnamed: 0,0
0,-0.299307
1,0.336438
2,-0.064640
3,0.430792
4,0.402182
...,...
1557,-0.000000
1558,-0.000000
1559,-0.000000
1560,-0.000000


In [76]:
pipe = make_pipeline(LinearRegression()).fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.9999999999999997, 0.9999999999999993)

In [77]:
pd.concat([pd.Series(y_test).reset_index(drop=True), pd.Series(pipe.predict(X_test))], axis=1).head()

Unnamed: 0,rating,0
0,29.924285,29.924285
1,49.787445,49.787445
2,39.7034,39.7034
3,60.756112,60.756112
4,45.811716,45.811716


In [78]:
lr_reg = LinearRegression().fit(X_train, y_train)
lr_reg.score(X_test, y_test)

0.9999999999999993

In [79]:
diabetes = load_diabetes()
diab_df = pd.DataFrame(diabetes['data'], columns=diabetes['feature_names'])
diab_df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641
