# Logistic Regression Model (cost_of_living_by_city)

In [1]:
# Import your dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from path import Path
import numpy
import os

In [2]:
data = Path('../Resources/cost_of_living_index_by_city.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,City,State,Cost of Living Index
0,Anchorage,AK,110.7
1,Fairbanks,AK,107.9
2,Albertville,AL,90.9
3,Anniston,AL,85.7
4,Auburn,AL,91.9


In [3]:
df["outcome"] = df['Cost of Living Index']
df

Unnamed: 0,City,State,Cost of Living Index,outcome
0,Anchorage,AK,110.7,110.7
1,Fairbanks,AK,107.9,107.9
2,Albertville,AL,90.9,90.9
3,Anniston,AL,85.7,85.7
4,Auburn,AL,91.9,91.9
...,...,...,...,...
505,Parkersburg,WV,84.7,84.7
506,Weirton,WV,82.6,82.6
507,Wheeling,WV,84.1,84.1
508,Casper,WY,94.2,94.2


Data Preprocessing

In [4]:
# Change cost of living column to Boolean based on index
index_df= df['outcome']<=100 
index_df

0      False
1      False
2       True
3       True
4       True
       ...  
505     True
506     True
507     True
508     True
509     True
Name: outcome, Length: 510, dtype: bool

In [5]:
# Flip boolean to binary 
index_df.astype(int)

0      0
1      0
2      1
3      1
4      1
      ..
505    1
506    1
507    1
508    1
509    1
Name: outcome, Length: 510, dtype: int32

In [6]:
# Add the converted column back to the DataFrame
df['outcome'] = index_df.astype(int)
df


Unnamed: 0,City,State,Cost of Living Index,outcome
0,Anchorage,AK,110.7,0
1,Fairbanks,AK,107.9,0
2,Albertville,AL,90.9,1
3,Anniston,AL,85.7,1
4,Auburn,AL,91.9,1
...,...,...,...,...
505,Parkersburg,WV,84.7,1
506,Weirton,WV,82.6,1
507,Wheeling,WV,84.1,1
508,Casper,WY,94.2,1


In [8]:
df.dtypes

City                     object
State                    object
Cost of Living Index    float64
outcome                   int32
dtype: object

In [11]:
# Seperate the features (x = input) from the Targes (y = outcome)
y = df['outcome']
X = df.drop(columns='outcome')

In [12]:
# Split out data into training and testing
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(382, 3)

In [13]:
# Create a Logistic Regression Model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

In [14]:
# Fit (train) or model using the training data
classifier.fit(X_train, y_train)

ValueError: could not convert string to float: 'Albany'

In [None]:
# Make Predictions
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

In [None]:
# Import Accuracy Score
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))