In [1]:
# Logistic Regression Model

In [2]:
# Import your dependencies
# Initial imports
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.model_selection import train_test_split

In [3]:
# Read in the file
data = Path('../Resources/cost_of_living_index_by_city.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,City,State,Cost of Living Index
0,Anchorage,AK,110.7
1,Fairbanks,AK,107.9
2,Albertville,AL,90.9
3,Anniston,AL,85.7
4,Auburn,AL,91.9


In [4]:
# Can only use numerical values (Update City)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df2 = df.copy()
df2['City'] = le.fit_transform(df2['City'])
df2

Unnamed: 0,City,State,Cost of Living Index
0,13,AK,110.7
1,126,AK,107.9
2,6,AL,90.9
3,15,AL,85.7
4,22,AL,91.9
...,...,...,...
505,322,WV,84.7
506,451,WV,82.6
507,453,WV,84.1
508,62,WY,94.2


In [5]:
# Can only use numerical values (Update State)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df3 = df2.copy()
df3['State'] = le.fit_transform(df3['State'])
df3

Unnamed: 0,City,State,Cost of Living Index
0,13,0,110.7
1,126,0,107.9
2,6,1,90.9
3,15,1,85.7
4,22,1,91.9
...,...,...,...
505,322,49,84.7
506,451,49,82.6
507,453,49,84.1
508,62,50,94.2


In [6]:
# Update Column for index
df3["outcome"] = df3['Cost of Living Index']
df3

Unnamed: 0,City,State,Cost of Living Index,outcome
0,13,0,110.7,110.7
1,126,0,107.9,107.9
2,6,1,90.9,90.9
3,15,1,85.7,85.7
4,22,1,91.9,91.9
...,...,...,...,...
505,322,49,84.7,84.7
506,451,49,82.6,82.6
507,453,49,84.1,84.1
508,62,50,94.2,94.2


Data Preprocessing

In [7]:
# Change cost of living column to Boolean based on index
index_df= df3['outcome']<=100 
index_df

0      False
1      False
2       True
3       True
4       True
       ...  
505     True
506     True
507     True
508     True
509     True
Name: outcome, Length: 510, dtype: bool

In [8]:
# Flip boolean to binary 
index_df.astype(int)

0      0
1      0
2      1
3      1
4      1
      ..
505    1
506    1
507    1
508    1
509    1
Name: outcome, Length: 510, dtype: int32

In [9]:
## Add the converted column back to the DataFrame
df3['outcome'] = index_df.astype(int)
df3

Unnamed: 0,City,State,Cost of Living Index,outcome
0,13,0,110.7,0
1,126,0,107.9,0
2,6,1,90.9,1
3,15,1,85.7,1
4,22,1,91.9,1
...,...,...,...,...
505,322,49,84.7,1
506,451,49,82.6,1
507,453,49,84.1,1
508,62,50,94.2,1


In [10]:
df3.dtypes

City                      int32
State                     int32
Cost of Living Index    float64
outcome                   int32
dtype: object

In [11]:
# Seperate the features (x = input) from the Targes (y = outcome)
y = df3['outcome']
X = df3.drop(columns='outcome')

In [12]:
# Split out data into training and testing
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(382, 3)

In [13]:
# Create a Logistic Regression Model
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import tree
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

In [14]:
# Fit (train) or model using the training data
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [15]:
# Make Predictions
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,1,1
2,1,1
3,0,0
4,1,1
5,0,0
6,1,1
7,1,1
8,1,1
9,1,1


In [16]:
# Import Accuracy Score
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.984375
