In [2]:
 %matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
data = pd.read_csv('./Resources/WA_Fn-UseC_-Marketing-Customer-Value-Analysis.csv')
data.head()

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,Employed,F,56274,...,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/11,Unemployed,F,0,...,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize
2,AI49188,Nevada,12887.43165,No,Premium,Bachelor,2/19/11,Employed,F,48767,...,38,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize
3,WW63253,California,7645.861827,No,Basic,Bachelor,1/20/11,Unemployed,M,0,...,65,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,2/3/11,Employed,M,43836,...,44,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize


# 1. **Data Exploration**

In [4]:
# Identify categorical variables vs numberic
categorical_variables = data.select_dtypes(exclude=['int', 'float','number']).columns
numeric_variables = data.select_dtypes(include=['int', 'float','number']).columns

print(f'Categorical: {categorical_variables}')
print(f'Numeric: {numeric_variables}')

print(len(categorical_variables))
print(len(numeric_variables))

Categorical: Index(['Customer', 'State', 'Response', 'Coverage', 'Education',
       'Effective To Date', 'EmploymentStatus', 'Gender', 'Location Code',
       'Marital Status', 'Policy Type', 'Policy', 'Renew Offer Type',
       'Sales Channel', 'Vehicle Class', 'Vehicle Size'],
      dtype='object')
Numeric: Index(['Customer Lifetime Value', 'Income', 'Monthly Premium Auto',
       'Months Since Last Claim', 'Months Since Policy Inception',
       'Number of Open Complaints', 'Number of Policies',
       'Total Claim Amount'],
      dtype='object')
16
8


In [5]:
# There is a total of 16 categorical variables and 8 numeric variables
# Understand the data
for col in data.columns: 
    print(col)
len(list(data.columns.values))

Customer
State
Customer Lifetime Value
Response
Coverage
Education
Effective To Date
EmploymentStatus
Gender
Income
Location Code
Marital Status
Monthly Premium Auto
Months Since Last Claim
Months Since Policy Inception
Number of Open Complaints
Number of Policies
Policy Type
Policy
Renew Offer Type
Sales Channel
Total Claim Amount
Vehicle Class
Vehicle Size


24

In [6]:
# Understand Categorical Values
state = data.State.unique()
print(f'States: {state}')
print(len(state))
print("-----------------------")

Response = data.Response.unique()
print(f'Response: {Response}')
print(len(Response))
print("-----------------------")

coverage = data.Coverage.unique()
print(f'Response: {coverage}')
print(len(coverage))
print("-----------------------")

education = data.Education.unique()
print(f'Response: {education}')
print(len(education))
print("-----------------------")

policy = data.Policy.unique()
print(f'Response: {policy}')
print(len(policy))
print("-----------------------")

States: ['Washington' 'Arizona' 'Nevada' 'California' 'Oregon']
5
-----------------------
Response: ['No' 'Yes']
2
-----------------------
Response: ['Basic' 'Extended' 'Premium']
3
-----------------------
Response: ['Bachelor' 'College' 'Master' 'High School or Below' 'Doctor']
5
-----------------------
Response: ['Corporate L3' 'Personal L3' 'Corporate L2' 'Personal L1' 'Special L2'
 'Corporate L1' 'Personal L2' 'Special L1' 'Special L3']
9
-----------------------


In [47]:
policy = data['Vehicle Class'].unique()
print(f'Response: {policy}')
print(len(policy))
print("-----------------------")

Response: ['Two-Door Car' 'Four-Door Car' 'SUV' 'Luxury SUV' 'Sports Car'
 'Luxury Car']
6
-----------------------


In [7]:
# Customer LTV range
# 

print("Min:")
print(min(data["Customer Lifetime Value"]))
print("Max:")
print(max(data["Customer Lifetime Value"]))
print("-------------")
print("Average:")
print(data['Customer Lifetime Value'].mean())
print("Median:")
print(data['Customer Lifetime Value'].median())



Min:
1898.007675
Max:
83325.38119
-------------
Average:
8004.9404749870755
Median:
5780.182197


# 2. **Data Processing**

In [8]:
# Identify and eliminate any NA values in the data set
# Number of entries
print("---------------------")
print(len(data))
# Drop any NA values
data.dropna()
print("---------------------")
print(len(data))

---------------------
9134
---------------------
9134


In [9]:
# Drop unneccessary columns
new_data = data.drop(columns=['Effective To Date', 'Customer'])
new_data

Unnamed: 0,State,Customer Lifetime Value,Response,Coverage,Education,EmploymentStatus,Gender,Income,Location Code,Marital Status,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,Washington,2763.519279,No,Basic,Bachelor,Employed,F,56274,Suburban,Married,...,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize
1,Arizona,6979.535903,No,Extended,Bachelor,Unemployed,F,0,Suburban,Single,...,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize
2,Nevada,12887.431650,No,Premium,Bachelor,Employed,F,48767,Suburban,Married,...,38,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize
3,California,7645.861827,No,Basic,Bachelor,Unemployed,M,0,Suburban,Married,...,65,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize
4,Washington,2813.692575,No,Basic,Bachelor,Employed,M,43836,Rural,Single,...,44,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9129,California,23405.987980,No,Basic,Bachelor,Employed,M,71941,Urban,Married,...,89,0,2,Personal Auto,Personal L1,Offer2,Web,198.234764,Four-Door Car,Medsize
9130,California,3096.511217,Yes,Extended,College,Employed,F,21604,Suburban,Divorced,...,28,0,1,Corporate Auto,Corporate L3,Offer1,Branch,379.200000,Four-Door Car,Medsize
9131,California,8163.890428,No,Extended,Bachelor,Unemployed,M,0,Suburban,Single,...,37,3,2,Corporate Auto,Corporate L2,Offer1,Branch,790.784983,Four-Door Car,Medsize
9132,California,7524.442436,No,Extended,College,Employed,M,21941,Suburban,Married,...,3,0,3,Personal Auto,Personal L2,Offer3,Branch,691.200000,Four-Door Car,Large


**Regression Analysys With Continuous Variables Only**

In [10]:
# new Library for Regression Analysis
import statsmodels.api as sm

In [11]:
continous_var_df = new_data.select_dtypes(include=['int64','float'])
continous_var_df.nunique()
continous_var_df.head()

Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Total Claim Amount
0,2763.519279,56274,69,32,5,0,1,384.811147
1,6979.535903,0,94,13,42,0,8,1131.464935
2,12887.43165,48767,108,18,38,0,2,566.472247
3,7645.861827,0,106,18,65,0,7,529.881344
4,2813.692575,43836,73,12,44,0,1,138.130879


In [12]:
continous_var_df.columns
continous_var_df.head()

Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Total Claim Amount
0,2763.519279,56274,69,32,5,0,1,384.811147
1,6979.535903,0,94,13,42,0,8,1131.464935
2,12887.43165,48767,108,18,38,0,2,566.472247
3,7645.861827,0,106,18,65,0,7,529.881344
4,2813.692575,43836,73,12,44,0,1,138.130879


In [13]:
#Normalize the data to feed lineraRegression function
from sklearn import preprocessing
x = continous_var_df.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
continous_var_df_normalized = pd.DataFrame(x_scaled)

continous_var_df_normalized.head()




Unnamed: 0,0,1,2,3,4,5,6,7
0,0.010629,0.562847,0.033755,0.914286,0.050505,0.0,0.0,0.132974
1,0.062406,0.0,0.139241,0.371429,0.424242,0.0,0.875,0.391051
2,0.13496,0.487763,0.198312,0.514286,0.383838,0.0,0.125,0.195764
3,0.070589,0.0,0.189873,0.514286,0.656566,0.0,0.75,0.183117
4,0.011245,0.438443,0.050633,0.342857,0.444444,0.0,0.0,0.04771


In [14]:
continous_var_reg = sm.Logit(continous_var_df_normalized[0], continous_var_df_normalized.drop(0, axis = 1))
continous_var_reg.fit().summary()

Optimization terminated successfully.
         Current function value: 0.143238
         Iterations 7


0,1,2,3
Dep. Variable:,y,No. Observations:,9134.0
Model:,Logit,Df Residuals:,9127.0
Method:,MLE,Df Model:,6.0
Date:,"Mon, 20 Jul 2020",Pseudo R-squ.:,-7.968
Time:,12:50:37,Log-Likelihood:,-1308.3
converged:,True,LL-Null:,-145.89
Covariance Type:,nonrobust,LLR p-value:,1.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
1.0,-1.6625,0.123,-13.485,0.000,-1.904,-1.421
2.0,3.8137,0.299,12.760,0.000,3.228,4.399
3.0,-1.2582,0.124,-10.175,0.000,-1.501,-1.016
4.0,-1.4743,0.120,-12.305,0.000,-1.709,-1.239
5.0,-0.8848,0.243,-3.634,0.000,-1.362,-0.408
6.0,-0.5958,0.132,-4.525,0.000,-0.854,-0.338
7.0,-6.2850,0.441,-14.265,0.000,-7.149,-5.421


In [15]:
#### All P-values are below 0.05 we can assume all continous vaiables are important ###

**Regression Analysis with Categorical Variables**

In [16]:
cat_df = new_data.select_dtypes(include='object')

In [17]:
cat_df.nunique()

State               5
Response            2
Coverage            3
Education           5
EmploymentStatus    5
Gender              2
Location Code       3
Marital Status      3
Policy Type         3
Policy              9
Renew Offer Type    4
Sales Channel       4
Vehicle Class       6
Vehicle Size        3
dtype: int64

In [18]:
cat_df.columns

Index(['State', 'Response', 'Coverage', 'Education', 'EmploymentStatus',
       'Gender', 'Location Code', 'Marital Status', 'Policy Type', 'Policy',
       'Renew Offer Type', 'Sales Channel', 'Vehicle Class', 'Vehicle Size'],
      dtype='object')

In [19]:
cols = ['State', 'Coverage', 'Education', 'EmploymentStatus', 'Gender',
       'Location Code', 'Marital Status', 'Policy Type', 'Policy',
       'Renew Offer Type', 'Sales Channel', 'Vehicle Class', 'Vehicle Size','Response']

In [20]:
# Convert Categorical Varables
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
for col in cat_df[cols]:
    cat_df[col] = lb.fit_transform(cat_df[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [21]:
cat_df.head()

Unnamed: 0,State,Response,Coverage,Education,EmploymentStatus,Gender,Location Code,Marital Status,Policy Type,Policy,Renew Offer Type,Sales Channel,Vehicle Class,Vehicle Size
0,4,0,0,0,1,0,1,1,0,2,0,0,5,1
1,0,0,1,0,4,0,1,2,1,5,2,0,0,1
2,2,0,2,0,1,0,1,1,1,5,0,0,5,1
3,1,0,0,0,4,1,1,1,0,1,0,2,3,1
4,4,0,0,0,1,1,0,2,1,3,0,0,0,1


In [22]:
x_cat = cat_df.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x_cat)
cat_df_normalized = pd.DataFrame(x_scaled)

cat_df_normalized.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1.0,0.0,0.0,0.0,0.25,0.0,0.5,0.5,0.0,0.25,0.0,0.0,1.0,0.5
1,0.0,0.0,0.5,0.0,1.0,0.0,0.5,1.0,0.5,0.625,0.666667,0.0,0.0,0.5
2,0.5,0.0,1.0,0.0,0.25,0.0,0.5,0.5,0.5,0.625,0.0,0.0,1.0,0.5
3,0.25,0.0,0.0,0.0,1.0,1.0,0.5,0.5,0.0,0.125,0.0,0.666667,0.6,0.5
4,1.0,0.0,0.0,0.0,0.25,1.0,0.0,1.0,0.5,0.375,0.0,0.0,0.0,0.5


In [23]:
categorical_train = sm.Logit(continous_var_df_normalized[0], cat_df)
categorical_train.fit().summary()

Optimization terminated successfully.
         Current function value: 0.132800
         Iterations 7


0,1,2,3
Dep. Variable:,y,No. Observations:,9134.0
Model:,Logit,Df Residuals:,9120.0
Method:,MLE,Df Model:,13.0
Date:,"Mon, 20 Jul 2020",Pseudo R-squ.:,-7.315
Time:,12:50:38,Log-Likelihood:,-1213.0
converged:,True,LL-Null:,-145.89
Covariance Type:,nonrobust,LLR p-value:,1.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
State,-0.1066,0.029,-3.655,0.000,-0.164,-0.049
Response,-0.3603,0.115,-3.127,0.002,-0.586,-0.134
Coverage,0.1337,0.055,2.419,0.016,0.025,0.242
Education,-0.0645,0.027,-2.354,0.019,-0.118,-0.011
EmploymentStatus,-0.1266,0.031,-4.031,0.000,-0.188,-0.065
Gender,-0.2079,0.076,-2.725,0.006,-0.357,-0.058
Location Code,-0.2491,0.058,-4.288,0.000,-0.363,-0.135
Marital Status,-0.2839,0.061,-4.651,0.000,-0.404,-0.164
Policy Type,0.4244,0.162,2.622,0.009,0.107,0.742


In [24]:
## All cateforical variables seem to have significant relationship with the output variable 'Customer life time value', except 
## vehicle class 

In [25]:
# Conversion of categorical data on the main data frame
# Machine Learning algorithms work with numerical data. We have to convert our strings into meaningful numbers.
# We often use Integer, One-hot, or Binary Encoding. Sklearn provides a preprocessing libarary for all of these 
# standard preprocessing techniques. Pandas also provides a get_dummies method that is useful to generate binary 
# encoded data from a Data Frame.

new_data.columns
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
for col in new_data[cols]:
    new_data[col] = lb.fit_transform(new_data[col])
    
new_data.head()

Unnamed: 0,State,Customer Lifetime Value,Response,Coverage,Education,EmploymentStatus,Gender,Income,Location Code,Marital Status,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,4,2763.519279,0,0,0,1,0,56274,1,1,...,5,0,1,0,2,0,0,384.811147,5,1
1,0,6979.535903,0,1,0,4,0,0,1,2,...,42,0,8,1,5,2,0,1131.464935,0,1
2,2,12887.43165,0,2,0,1,0,48767,1,1,...,38,0,2,1,5,0,0,566.472247,5,1
3,1,7645.861827,0,0,0,4,1,0,1,1,...,65,0,7,0,1,0,2,529.881344,3,1
4,4,2813.692575,0,0,0,1,1,43836,0,2,...,44,0,1,1,3,0,0,138.130879,0,1


In [26]:
# Drop any variables that with a P-value > 0.05
new_data_converted = new_data.drop(columns=['Vehicle Class'])
new_data_converted.head()

Unnamed: 0,State,Customer Lifetime Value,Response,Coverage,Education,EmploymentStatus,Gender,Income,Location Code,Marital Status,...,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Size
0,4,2763.519279,0,0,0,1,0,56274,1,1,...,32,5,0,1,0,2,0,0,384.811147,1
1,0,6979.535903,0,1,0,4,0,0,1,2,...,13,42,0,8,1,5,2,0,1131.464935,1
2,2,12887.43165,0,2,0,1,0,48767,1,1,...,18,38,0,2,1,5,0,0,566.472247,1
3,1,7645.861827,0,0,0,4,1,0,1,1,...,18,65,0,7,0,1,0,2,529.881344,1
4,4,2813.692575,0,0,0,1,1,43836,0,2,...,12,44,0,1,1,3,0,0,138.130879,1


In [27]:
# Add a column of 'Quartile LTV'. This column will determine the quartile where each 
# customer falls in thems of Customer Life Time Value.

# First, find the quartile threshold values

ltv = new_data["Customer Lifetime Value"]  

print("Q3 quantile of arr : ", np.quantile(ltv, .25)) 
print("Q2 quantile of arr : ", np.quantile(ltv, .50))  
print("Q1 quantile of arr : ", np.quantile(ltv, .75)) 

Q3 quantile of arr :  3994.25179425
Q2 quantile of arr :  5780.182197
Q1 quantile of arr :  8962.16704125


In [28]:
new_data_converted.loc[(new_data_converted['Customer Lifetime Value'] >0) & (new_data_converted['Customer Lifetime Value']<= 3994.26), 'Quartile LTV'] = 4 
new_data_converted.loc[(new_data_converted['Customer Lifetime Value'] > 3994.26)& (new_data_converted['Customer Lifetime Value'] <= 5780.182197), 'Quartile LTV'] = 3
new_data_converted.loc[(new_data_converted['Customer Lifetime Value'] > 5780.19) & (new_data_converted['Customer Lifetime Value']<= 8962.16), 'Quartile LTV'] = 2
new_data_converted.loc[(new_data_converted['Customer Lifetime Value'] > 8962.17), 'Quartile LTV'] = 1
new_data_converted.head()

Unnamed: 0,State,Customer Lifetime Value,Response,Coverage,Education,EmploymentStatus,Gender,Income,Location Code,Marital Status,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Size,Quartile LTV
0,4,2763.519279,0,0,0,1,0,56274,1,1,...,5,0,1,0,2,0,0,384.811147,1,4.0
1,0,6979.535903,0,1,0,4,0,0,1,2,...,42,0,8,1,5,2,0,1131.464935,1,2.0
2,2,12887.43165,0,2,0,1,0,48767,1,1,...,38,0,2,1,5,0,0,566.472247,1,1.0
3,1,7645.861827,0,0,0,4,1,0,1,1,...,65,0,7,0,1,0,2,529.881344,1,2.0
4,4,2813.692575,0,0,0,1,1,43836,0,2,...,44,0,1,1,3,0,0,138.130879,1,4.0


In [29]:
#Drop 'Customer Lifetime Value' variable
new_data_quartileLTV = new_data.drop(columns=['Customer Lifetime Value'])
new_data_quartileLTV.head()

Unnamed: 0,State,Response,Coverage,Education,EmploymentStatus,Gender,Income,Location Code,Marital Status,Monthly Premium Auto,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,4,0,0,0,1,0,56274,1,1,69,...,5,0,1,0,2,0,0,384.811147,5,1
1,0,0,1,0,4,0,0,1,2,94,...,42,0,8,1,5,2,0,1131.464935,0,1
2,2,0,2,0,1,0,48767,1,1,108,...,38,0,2,1,5,0,0,566.472247,5,1
3,1,0,0,0,4,1,0,1,1,106,...,65,0,7,0,1,0,2,529.881344,3,1
4,4,0,0,0,1,1,43836,0,2,73,...,44,0,1,1,3,0,0,138.130879,0,1



# 3. **Linear Regression Model With Quartiles**

In [30]:
# Assign X (data) and y (target)
X = new_data_quartileLTV.drop("Quartile LTV", axis=1)
y = new_data_quartileLTV["Quartile LTV"].values.reshape(-1, 1)
print(X.shape, y.shape)


KeyError: "['Quartile LTV'] not found in axis"

In [31]:
# Use train_test_split to create training and testing data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100)


NameError: name 'X' is not defined

In [32]:
# Create the model
from sklearn.linear_model import LinearRegression
linear_regression_model_1 = LinearRegression()

In [33]:
# Fit the model to the training data and calculate the scores for the training and testing data

linear_regression_model_1.fit(X_train, y_train)
training_score = linear_regression_model_1 .score(X_train, y_train)
testing_score = linear_regression_model_1 .score(X_test, y_test)


print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

NameError: name 'X_train' is not defined

In [None]:
plt.scatter(linear_regression_model_1.predict(X_train), linear_regression_model_1.predict(X_train) - y_train, c="blue", label="Training Data")
plt.scatter(linear_regression_model_1.predict(X_test), linear_regression_model_1.predict(X_test) - y_test, c="orange", label="Testing Data")
plt.legend()
plt.hlines(y=0, xmin=y.min(), xmax=y.max())
plt.title("Residual Plot")

In [34]:
linear_regression_model_1.get_params()

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'normalize': False}

In [41]:
print('intercept:', linear_regression_model_1.intercept_)
print('slope:', linear_regression_model_1.coef_)

AttributeError: 'LinearRegression' object has no attribute 'intercept_'

# **Liear Regression Model LTV**

In [36]:
# Assign X (data) and y (target)
X_1 = new_data_converted.drop("Customer Lifetime Value", axis=1)
y_1 = new_data_converted["Customer Lifetime Value"].values.reshape(-1, 1)
print(X_1.shape, y_1.shape)

(9134, 21) (9134, 1)


In [37]:
# Use train_test_split to create training and testing data
from sklearn.model_selection import train_test_split
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_1, y_1, random_state=100)

In [38]:
# Create the model
from sklearn.linear_model import LinearRegression
linear_regression_model_2 = LinearRegression()

In [39]:

linear_regression_model_2.fit(X_train_1, y_train_1)
training_score = linear_regression_model_2.score(X_train_1, y_train_1)
testing_score = linear_regression_model_2.score(X_test_1, y_test_1)


print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.5559320714561835
Testing Score: 0.5408259109413627


In [40]:
print('intercept:', linear_regression_model_2.intercept_)
print('slope:', linear_regression_model_2.coef_)

intercept: [18637.93053918]
slope: [[ 1.70081645e+01 -2.81594432e+02 -7.48986218e+02  3.55349225e+01
  -4.74070410e+01 -1.25256459e+02  3.08558733e-04 -9.78454342e+00
  -6.72601788e+01  2.29267936e+01  1.98827993e+00  1.91230061e+00
  -6.16682875e+01 -4.90643365e+02  9.88582013e+01  3.11968235e+01
  -1.31104313e+01  5.52430023e+01  6.39405700e-02 -1.49953625e+01
  -4.45587387e+03]]
