# Importing libraries

In [190]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Read data

In [156]:
# Load the startups dataset from the CSV in the working directory, then display it
data = pd.read_csv('50_Startups.csv')
data

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,130298.13,145530.06,323876.68,Florida,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


In [141]:
data['State'].value_counts()

New York      17
California    17
Florida       16
Name: State, dtype: int64

In [142]:
data.describe(include='all')

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
count,50.0,50.0,50.0,50,50.0
unique,,,,3,
top,,,,New York,
freq,,,,17,
mean,73721.6156,121344.6396,211025.0978,,112012.6392
std,45902.256482,28017.802755,122290.310726,,40306.180338
min,0.0,51283.14,0.0,,14681.4
25%,39936.37,103730.875,129300.1325,,90138.9025
50%,73051.08,122699.795,212716.24,,107978.19
75%,101602.8,144842.18,299469.085,,139765.9775


In [143]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


# Data preprocessing

* There is a column of type object, and each value in it indicates the name of the state

* Either we remove this column, or we replace each state's value with its own ZIP code

In [144]:
# geo-data.csv lists every US state together with its ZIP codes.
# Read 'zipcode' explicitly as text: ZIP codes are identifiers, not quantities, so
# leading zeros (e.g. 06390) and non-numeric entries (e.g. 831HH) must survive the
# load unchanged instead of depending on pandas' per-chunk dtype inference.
US_cities = pd.read_csv('geo-data.csv', dtype={'zipcode': str})

In [145]:
US_cities

Unnamed: 0,state_fips,state,state_abbr,zipcode,county,city
0,1,Alabama,AL,35004,St. Clair,Acmar
1,1,Alabama,AL,35005,Jefferson,Adamsville
2,1,Alabama,AL,35006,Jefferson,Adger
3,1,Alabama,AL,35007,Shelby,Keystone
4,1,Alabama,AL,35010,Tallapoosa,New site
...,...,...,...,...,...,...
33098,56,Wyoming,WY,83126,Lincoln,Smoot
33099,56,Wyoming,WY,83127,Lincoln,Thayne
33100,56,Wyoming,WY,83128,Lincoln,Alpine
33101,56,Wyoming,WY,831HH,Lincoln,Zcta 831hh


In [146]:
# keep only the two columns needed below: 'state' and 'zipcode'
code_state = US_cities.loc[:, ['state', 'zipcode']]

In [147]:
# keep just the first ZIP code listed for each state and discard the rest;
# reset_index() renumbers the rows from zero (the old labels land in an 'index' column)
code_state = (
    pd.DataFrame(code_state)
    .drop_duplicates(subset='state', keep='first')
    .reset_index()
)

In [148]:
# drop the old index column left behind by reset_index();
# errors='ignore' keeps this cell re-runnable once the column is already gone
code_state = code_state.drop(columns='index', errors='ignore')
code_state

Unnamed: 0,state,zipcode
0,Alabama,35004
1,Alaska,99501
2,Arizona,84536
3,Arkansas,38041
4,California,89439
5,Colorado,80002
6,Connecticut,6001
7,Delaware,19701
8,District of columbia,20001
9,Florida,32008


In [164]:
# check duplicated data: count each distinct (state, zipcode) row; every count should be 1
code_state.value_counts()

state                 zipcode
Alabama               35004      1
Pennsylvania          15001      1
Nevada                89001      1
New hampshire         03031      1
New jersey            07001      1
New mexico            79922      1
New york              06390      1
North carolina        27006      1
North dakota          57638      1
Ohio                  43001      1
Oklahoma              67950      1
Oregon                97001      1
Rhode island          02804      1
Montana               59001      1
South carolina        29001      1
South dakota          56219      1
Tennessee             37010      1
Texas                 73949      1
Utah                  84001      1
Vermont               05001      1
Virginia              20105      1
Washington            98001      1
West virginia         24701      1
Wisconsin             53001      1
Nebraska              68001      1
Missouri              51630      1
Alaska                99501      1
Idaho                 831

In [191]:
# function to replace each state of the original data with its own ZIP code, then return the new data
def replace_states(original_data, data_assestant):
    """Return a copy of ``original_data`` with state names replaced by ZIP codes.

    Parameters
    ----------
    original_data : pandas.DataFrame
        Frame containing a ``'State'`` column of state names.
    data_assestant : pandas.DataFrame
        Lookup frame with a ``'state'`` column (names) and a ``'zipcode'`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``original_data`` itself is left untouched.

    Notes
    -----
    * Names are compared case-insensitively via ``str.title()``.
    * If a state appears more than once in ``data_assestant``, the first
      listed ZIP code is used.
    * Values in ``'State'`` with no match (or that are not strings) are kept as is.
    """
    # Build the lookup once; setdefault keeps the first ZIP code seen per state.
    zip_by_state = {}
    for state, zipcode in zip(data_assestant['state'], data_assestant['zipcode']):
        if isinstance(state, str):
            zip_by_state.setdefault(state.title(), zipcode)

    def to_zip(value):
        # Fall back to the original value when there is nothing to look up.
        if isinstance(value, str):
            return zip_by_state.get(value.title(), value)
        return value

    # Work on an explicit copy so the caller's frame is never modified, and assign
    # the whole column at once (no chained indexing, no dependence on index labels).
    replaced = original_data.copy()
    replaced['State'] = replaced['State'].map(to_zip)
    return replaced

In [192]:
# Swap each state name in the startups data for the ZIP code listed in code_state
final_data = replace_states(data, code_state)

In [193]:
final_data['State'].value_counts()

89439    17
06390    17
32008    16
Name: State, dtype: int64

In [194]:
final_data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,6390,192261.83
1,162597.7,151377.59,443898.53,89439,191792.06
2,153441.51,101145.55,407934.54,32008,191050.39
3,144372.41,118671.85,383199.62,6390,182901.99
4,142107.34,91391.77,366168.42,32008,166187.94


# Train Test Splitting

In [182]:
# Every column but the last is a predictor; the last column is the value to predict.
# Both are converted to 2-D NumPy arrays: (rows, columns - 1) and (rows, 1).
n_rows, n_cols = final_data.shape
predictor_frame = final_data.iloc[:, :-1]
target_series = final_data.iloc[:, -1]
features = predictor_frame.to_numpy().reshape(n_rows, n_cols - 1)
target = target_series.to_numpy().reshape(n_rows, 1)

In [183]:
features.shape, target.shape

((50, 4), (50, 1))

In [185]:
# Hold out one third of the rows for testing; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=1/3,
    random_state=42,
)

In [186]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((33, 4), (17, 4), (33, 1), (17, 1))

# Train Model and Predict

In [187]:
# Fit a linear regression on the training split, then predict the held-out rows
LR = LinearRegression().fit(x_train, y_train)
y_predict = LR.predict(x_test)

# Evaluation Metrics (R² and MSE)

In [189]:
# R² obtained two ways (metric function, then the estimator's own score method),
# followed by the mean squared error on the test split
r2_from_metric = r2_score(y_test, y_predict)
print(r2_from_metric)
score = LR.score(x_test, y_test)
print("R-squared:", score)
test_mse = mean_squared_error(y_true=y_test, y_pred=y_predict)
print("MSE:", test_mse)

0.9486019758007465
R-squared: 0.9486019758007465
MSE: 73135299.29774594
