In [212]:
# Import dependencies
import pandas as pd
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

## Pull and Clean the Data 

In [213]:
df = pd.read_csv("Resources/Levels_Fyi_Salary_Data.csv")
df.head()

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,...,Race_White,Race_Two_Or_More,Race_Black,Race_Hispanic,Race,Education,location.1,Unnamed: 30,Unnamed: 31,Unnamed: 32
0,6/7/17 11:33,Oracle,L3,Product Manager,127000,"Redwood City, CA",1.5,1.5,,107000,...,0,0,0,0,,,Redwood City,CA,,
1,6/10/17 17:11,eBay,SE 2,Software Engineer,100000,"San Francisco, CA",5.0,3.0,,0,...,0,0,0,0,,,San Francisco,CA,,
2,6/11/17 14:53,Amazon,L7,Product Manager,310000,"Seattle, WA",8.0,0.0,,155000,...,0,0,0,0,,,Seattle,WA,,
3,6/17/17 0:23,Apple,M1,Software Engineering Manager,372000,"Sunnyvale, CA",7.0,5.0,,157000,...,0,0,0,0,,,Sunnyvale,CA,,
4,6/20/17 10:58,Microsoft,60,Software Engineer,157000,"Mountain View, CA",5.0,3.0,,0,...,0,0,0,0,,,Mountain View,CA,,


In [214]:
# Remove nulls
df = df[df['Unnamed: 32'].isnull()]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62618 entries, 0 to 62641
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   timestamp                62618 non-null  object 
 1   company                  62613 non-null  object 
 2   level                    62499 non-null  object 
 3   title                    62618 non-null  object 
 4   totalyearlycompensation  62618 non-null  int64  
 5   location                 62618 non-null  object 
 6   yearsofexperience        62618 non-null  float64
 7   yearsatcompany           62618 non-null  float64
 8   tag                      61764 non-null  object 
 9   basesalary               62618 non-null  int64  
 10  stockgrantvalue          62618 non-null  float64
 11  bonus                    62618 non-null  float64
 12  gender                   43083 non-null  object 
 13  otherdetails             40120 non-null  object 
 14  cityid                

In [215]:
# Remove nulls
df = df[df['Unnamed: 31'].isnull()]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52840 entries, 0 to 62641
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   timestamp                52840 non-null  object 
 1   company                  52837 non-null  object 
 2   level                    52747 non-null  object 
 3   title                    52840 non-null  object 
 4   totalyearlycompensation  52840 non-null  int64  
 5   location                 52840 non-null  object 
 6   yearsofexperience        52840 non-null  float64
 7   yearsatcompany           52840 non-null  float64
 8   tag                      52023 non-null  object 
 9   basesalary               52840 non-null  int64  
 10  stockgrantvalue          52840 non-null  float64
 11  bonus                    52840 non-null  float64
 12  gender                   35737 non-null  object 
 13  otherdetails             32641 non-null  object 
 14  cityid                

In [216]:
# Review the education column to see distinct values
df['Education'].value_counts()

Master's Degree      12901
Bachelor's Degree     9473
PhD                   1546
Some College           295
Highschool             207
Name: Education, dtype: int64

In [217]:
#Create a function to group education data into less bins
def education(value):
    if value == "Bachelor's Degree":
        return "College"
    elif value == "Highschool" or value == "Some College":
        return "High School"
    else:
        return value
 
df['education_mapped'] = df['Education'].map(education)
df['education_mapped'].value_counts()

Master's Degree    12901
College             9473
PhD                 1546
High School          502
Name: education_mapped, dtype: int64

In [218]:
# Review the Race column to see distinct values
df['Race'].value_counts()

Asian          8953
White          6390
Hispanic        957
Two Or More     673
Black           620
Name: Race, dtype: int64

In [219]:
# Create a function to group race data
def race(value):
    if value == 'White':
        return "White"
    if value == 'Asian':
        return "Asian"
    else:
        return "Minority"
 
df['race_mapped'] = df['Race'].map(race)
df['race_mapped'].value_counts()

Minority    37497
Asian        8953
White        6390
Name: race_mapped, dtype: int64

In [220]:
# Review the Title column to see distinct values
df['title'].value_counts()

Software Engineer               34245
Product Manager                  4185
Software Engineering Manager     3043
Data Scientist                   2227
Hardware Engineer                1968
Product Designer                 1349
Technical Program Manager        1234
Solution Architect                893
Management Consultant             814
Business Analyst                  739
Marketing                         630
Mechanical Engineer               456
Recruiter                         406
Sales                             337
Human Resources                   314
Name: title, dtype: int64

In [221]:
# Create a function to group title
def title(value):
    if value == 'Software Engineer': 
        return "Software Engineering"
    #or value == 'Solution Architect':
#         return "Software Engineering"
#     elif value == 'Product Manager' or value == 'Technical Program Manager':
#         return "Product Manager"
#     elif value == 'Mechanical Engineer':
#         return "Mechanical Engineer"
#     elif value == 'Hardware Engineer':
#         return "Hardware Engineer"
#     elif value == 'Management Consultant' or value == 'Business Analyst':
#         return "Business Analyst"
#     elif value == 'Marketing' or value == 'Sales':
#         return "Marketing / Sales"
#     elif value == 'Data Scientist':
#         return "Data Scientist"
#     elif value == 'Recruiter':
#         return "Recruiter"
#     elif value == 'Sales' or value == 'Marketing':
#         return "Sales/Marketing"
#     elif value == 'Human Resources':
#         return "Human Resources"
    else:
        return value
#         return "Product Designer"
 
df['title_mapped'] = df['title'].map(title)
df['title_mapped'].value_counts()

Software Engineering            34245
Product Manager                  4185
Software Engineering Manager     3043
Data Scientist                   2227
Hardware Engineer                1968
Product Designer                 1349
Technical Program Manager        1234
Solution Architect                893
Management Consultant             814
Business Analyst                  739
Marketing                         630
Mechanical Engineer               456
Recruiter                         406
Sales                             337
Human Resources                   314
Name: title_mapped, dtype: int64

In [222]:
# Review the YOE column to see distinct values
df['yearsofexperience'].value_counts()

5.00     4900
2.00     4719
3.00     4661
4.00     4118
0.00     4063
         ... 
69.00       1
0.58        1
0.60        1
1.40        1
10.50       1
Name: yearsofexperience, Length: 65, dtype: int64

In [223]:
# Create a function to group YOE
def title(value):
    if value <= 1:
        return "1 or less"
    elif value <= 2:
        return "Between 1 and 2"
    elif value <= 3:
        return "Between 2 and 3"
    elif value <= 4:
        return "Between 3 and 4"
    elif value <= 5:
        return "Between 4 and 5"
    elif value <= 6:
        return "Between 5 and 6"
    elif value <= 7:
        return "Between 6 and 7"
    elif value <= 8:
        return "Between 7 and 8"
    elif value <= 9:
        return "Between 8 and 9"
    elif value <= 10:
        return "Between 9 and 10"
    elif value <= 11:
        return "Between 10 and 11"
    elif value <= 12:
        return "Between 11 and 12"
    elif value <= 13:
        return "Between 12 and 13"
    elif value <= 14:
        return "Between 13 and 14"
    elif value <= 15:
        return "Between 14 and 15"
    elif value <= 16:
        return "Between 15 and 16"
    elif value <= 17:
        return "Between 16 and 17"
    elif value <= 18:
        return "Between 17 and 18"
    elif value <= 19:
        return "Between 18 and 19"
    elif value <= 20:
        return "Between 19 and 20"
    elif value <= 21:
        return "Between 20 and 21"
    elif value <= 22:
        return "Between 21 and 22"
    elif value <= 23:
        return "Between 22 and 23"
    elif value <= 24:
        return "Between 23 and 24"
    elif value <= 25:
        return "Between 24 and 25"
    elif value <= 26:
        return "Between 25 and 26"
    elif value <= 27:
        return "Between 26 and 27"
    elif value <= 28:
        return "Between 27 and 28"
    elif value <= 29:
        return "Between 28 and 29"
    elif value <= 30:
        return "Between 29 and 30"
    elif value <= 31:
        return "Between 30 and 31"
    elif value <= 32:
        return "Between 31 and 32"
    elif value <= 33:
        return "Between 32 and 33"
    elif value <= 34:
        return "Between 33 and 34"
    elif value <= 35:
        return "Between 34 and 35"
    else:
        return "35+"   

df['years_of_experience_mapped'] = df['yearsofexperience'].map(title)
df['years_of_experience_mapped'].value_counts()

1 or less            7517
Between 4 and 5      4902
Between 1 and 2      4731
Between 2 and 3      4673
Between 3 and 4      4124
Between 9 and 10     3939
Between 5 and 6      3322
Between 7 and 8      2968
Between 6 and 7      2785
Between 14 and 15    2541
Between 11 and 12    1823
Between 19 and 20    1714
Between 8 and 9      1673
Between 13 and 14    1011
Between 12 and 13     954
Between 10 and 11     939
Between 15 and 16     661
Between 17 and 18     569
Between 16 and 17     419
Between 24 and 25     368
Between 18 and 19     254
Between 21 and 22     204
Between 20 and 21     168
Between 22 and 23     133
Between 29 and 30     108
Between 23 and 24     106
Between 25 and 26      46
Between 26 and 27      38
Between 27 and 28      37
35+                    33
Between 34 and 35      27
Between 28 and 29      20
Between 32 and 33      12
Between 31 and 32      11
Between 30 and 31       6
Between 33 and 34       4
Name: years_of_experience_mapped, dtype: int64

In [224]:
# Review the state column to see distinct values
df['Unnamed: 30'].value_counts()

 CA        22824
 WA        12353
 NY         4715
 TX         2702
 MA         1738
 VA          917
 IL          885
 OR          637
 DC          592
 CO          590
 GA          573
 NC          504
 PA          480
 NJ          460
 AZ          379
 FL          298
 MN          278
 UT          224
 MI          218
 OH          202
 MO          190
 WI          155
 CT          106
 MD          105
 IN          100
 AR           88
 TN           79
 DE           59
 ID           45
 KS           37
 NH           35
 LA           31
 IA           31
 AL           25
 NE           25
 KY           24
 NV           24
 SC           22
 OK           18
 RI           18
 WV           11
 NM           10
 VT            7
 MT            7
 ND            5
 ME            4
 HI            4
 MS            3
 Israel        2
 WY            1
Name: Unnamed: 30, dtype: int64

In [225]:
# Create a function to group States
state_list = [' CA',' WA', ' NY', ' TX', ' MA', ' VA', ' IL', ' OR', ' DC', ' CO', ' GA', ' NC', ' PA', ' NJ',' AZ', ' FL', ' MN', ' UT', ' MI', ' OH', 'MO']
def state(value):
    if value in state_list:
        return value
    else:
        return "Other"

     
df['state_mapped'] = df['Unnamed: 30'].map(state)
df['state_mapped'].value_counts()

 CA      22824
 WA      12353
 NY       4715
 TX       2702
 MA       1738
Other     1271
 VA        917
 IL        885
 OR        637
 DC        592
 CO        590
 GA        573
 NC        504
 PA        480
 NJ        460
 AZ        379
 FL        298
 MN        278
 UT        224
 MI        218
 OH        202
Name: state_mapped, dtype: int64

In [226]:
# Review the base salary column to see distinct values
df['basesalary'].value_counts()

160000    2755
150000    2179
0         2061
130000    1613
140000    1586
          ... 
281000       1
489000       1
484000       1
608000       1
194688       1
Name: basesalary, Length: 455, dtype: int64

In [227]:
# Create a function to group base salaries
# Define the bin intervals and labels
start_value = 60000
end_value = 700000
bin_increment = 15000

# Create the bin edges
bin_edges = list(range(start_value, end_value + bin_increment, bin_increment))
bin_labels = [f'${value}-{value+bin_increment-1}k' for value in bin_edges[:-1]]
bin_labels.append('$250000+')

# Create the 'Salary_Range' column
df['salary_mapped'] = pd.cut(df['basesalary'], bins=bin_edges, labels=bin_labels[:-1], right=False)

# Update the categories of the 'Salary_Range' column
bin_labels.insert(0, '<$60000')
df['salary_mapped'] = pd.Categorical(df['salary_mapped'], categories=bin_labels)

# Assign "<$60000" label to values less than $60000
df.loc[df['basesalary'] < start_value, 'salary_mapped'] = '<$60000'
df.loc[df['basesalary'] > end_value, 'salary_mapped'] = '$250000+'

df['basesalary'].value_counts()

160000    2755
150000    2179
0         2061
130000    1613
140000    1586
          ... 
281000       1
489000       1
484000       1
608000       1
194688       1
Name: basesalary, Length: 455, dtype: int64

In [228]:
# Drop unneeded columns
df_final = df.drop(columns=['timestamp', 'company', 'level','location', 'tag', 'Unnamed: 31','Unnamed: 32','Masters_Degree', 'Bachelors_Degree', 'Doctorate_Degree','Highschool','Some_College','Race_Asian','Race_White','Race_Two_Or_More','Race_Black','Race_Hispanic', 'otherdetails', 'dmaid', 'rowNumber', 'stockgrantvalue', 'bonus', 'Unnamed: 30', 'title', 'yearsatcompany', 'yearsofexperience', 'location.1' , 'cityid'])
df_final

Unnamed: 0,totalyearlycompensation,basesalary,gender,Race,Education,education_mapped,race_mapped,title_mapped,years_of_experience_mapped,state_mapped,salary_mapped
0,127000,107000,,,,,Minority,Product Manager,Between 1 and 2,CA,$105000-119999k
1,100000,0,,,,,Minority,Software Engineering,Between 4 and 5,CA,<$60000
2,310000,155000,,,,,Minority,Product Manager,Between 7 and 8,WA,$150000-164999k
3,372000,157000,,,,,Minority,Software Engineering Manager,Between 6 and 7,CA,$150000-164999k
4,157000,0,,,,,Minority,Software Engineering,Between 4 and 5,CA,<$60000
...,...,...,...,...,...,...,...,...,...,...,...
62637,327000,155000,,,,,Minority,Software Engineering,Between 9 and 10,WA,$150000-164999k
62638,237000,146900,,,,,Minority,Software Engineering,Between 1 and 2,WA,$135000-149999k
62639,220000,157000,,,,,Minority,Software Engineering,Between 13 and 14,WA,$150000-164999k
62640,280000,194688,,,,,Minority,Software Engineering,Between 7 and 8,CA,$180000-194999k


In [229]:
# Drop nulls from dataframe
df_final = df_final.dropna(subset=['Race', 'Education', 'gender', 'salary_mapped'])
df_final

Unnamed: 0,totalyearlycompensation,basesalary,gender,Race,Education,education_mapped,race_mapped,title_mapped,years_of_experience_mapped,state_mapped,salary_mapped
15710,400000,210000,Male,Asian,PhD,PhD,Asian,Software Engineering,Between 4 and 5,CA,$210000-224999k
23532,136000,124000,Male,Two Or More,Bachelor's Degree,College,Minority,Software Engineering,Between 2 and 3,WA,$120000-134999k
23533,337000,177000,Male,Asian,Bachelor's Degree,College,Asian,Software Engineering,Between 5 and 6,CA,$165000-179999k
23534,222000,164000,Male,Asian,Master's Degree,Master's Degree,Asian,Software Engineering,Between 3 and 4,WA,$150000-164999k
23535,187000,165000,Male,White,Bachelor's Degree,College,White,Software Engineering,Between 4 and 5,CA,$165000-179999k
...,...,...,...,...,...,...,...,...,...,...,...
61981,1470000,290000,Male,Asian,Bachelor's Degree,College,Asian,Software Engineering Manager,Between 8 and 9,CA,$285000-299999k
61982,4500000,450000,Male,Asian,Master's Degree,Master's Degree,Asian,Product Manager,Between 19 and 20,CA,$450000-464999k
61984,1605000,250000,Female,White,Master's Degree,Master's Degree,White,Software Engineering Manager,Between 15 and 16,CO,$240000-254999k
61987,2372000,315000,Male,Black,Master's Degree,Master's Degree,Minority,Software Engineering Manager,Between 21 and 22,CA,$315000-329999k


In [230]:
# Rename columns
df_final = df_final.rename(columns={'location.1': 'city', 'Race': 'race', 'Education': 'education' })
df_final

Unnamed: 0,totalyearlycompensation,basesalary,gender,race,education,education_mapped,race_mapped,title_mapped,years_of_experience_mapped,state_mapped,salary_mapped
15710,400000,210000,Male,Asian,PhD,PhD,Asian,Software Engineering,Between 4 and 5,CA,$210000-224999k
23532,136000,124000,Male,Two Or More,Bachelor's Degree,College,Minority,Software Engineering,Between 2 and 3,WA,$120000-134999k
23533,337000,177000,Male,Asian,Bachelor's Degree,College,Asian,Software Engineering,Between 5 and 6,CA,$165000-179999k
23534,222000,164000,Male,Asian,Master's Degree,Master's Degree,Asian,Software Engineering,Between 3 and 4,WA,$150000-164999k
23535,187000,165000,Male,White,Bachelor's Degree,College,White,Software Engineering,Between 4 and 5,CA,$165000-179999k
...,...,...,...,...,...,...,...,...,...,...,...
61981,1470000,290000,Male,Asian,Bachelor's Degree,College,Asian,Software Engineering Manager,Between 8 and 9,CA,$285000-299999k
61982,4500000,450000,Male,Asian,Master's Degree,Master's Degree,Asian,Product Manager,Between 19 and 20,CA,$450000-464999k
61984,1605000,250000,Female,White,Master's Degree,Master's Degree,White,Software Engineering Manager,Between 15 and 16,CO,$240000-254999k
61987,2372000,315000,Male,Black,Master's Degree,Master's Degree,Minority,Software Engineering Manager,Between 21 and 22,CA,$315000-329999k


In [231]:
# Finalize the df with only needed columns
df_final = df_final[['basesalary','salary_mapped', 'gender', 'education_mapped', 'race_mapped', 'title_mapped', 'years_of_experience_mapped', 'state_mapped']]
df_final

Unnamed: 0,basesalary,salary_mapped,gender,education_mapped,race_mapped,title_mapped,years_of_experience_mapped,state_mapped
15710,210000,$210000-224999k,Male,PhD,Asian,Software Engineering,Between 4 and 5,CA
23532,124000,$120000-134999k,Male,College,Minority,Software Engineering,Between 2 and 3,WA
23533,177000,$165000-179999k,Male,College,Asian,Software Engineering,Between 5 and 6,CA
23534,164000,$150000-164999k,Male,Master's Degree,Asian,Software Engineering,Between 3 and 4,WA
23535,165000,$165000-179999k,Male,College,White,Software Engineering,Between 4 and 5,CA
...,...,...,...,...,...,...,...,...
61981,290000,$285000-299999k,Male,College,Asian,Software Engineering Manager,Between 8 and 9,CA
61982,450000,$450000-464999k,Male,Master's Degree,Asian,Product Manager,Between 19 and 20,CA
61984,250000,$240000-254999k,Female,Master's Degree,White,Software Engineering Manager,Between 15 and 16,CO
61987,315000,$315000-329999k,Male,Master's Degree,Minority,Software Engineering Manager,Between 21 and 22,CA


## Linear Regression

In [232]:
# Linear Regression
# Define variables
# Didn't scale the data due to the # of booleans we have in the data set
y = df_final['basesalary']
X = pd.get_dummies(df_final[['gender', 'education_mapped', 'race_mapped', 'title_mapped', 'years_of_experience_mapped']])
X

Unnamed: 0,gender_Female,gender_Male,gender_Other,education_mapped_College,education_mapped_High School,education_mapped_Master's Degree,education_mapped_PhD,race_mapped_Asian,race_mapped_Minority,race_mapped_White,...,years_of_experience_mapped_Between 30 and 31,years_of_experience_mapped_Between 31 and 32,years_of_experience_mapped_Between 32 and 33,years_of_experience_mapped_Between 34 and 35,years_of_experience_mapped_Between 4 and 5,years_of_experience_mapped_Between 5 and 6,years_of_experience_mapped_Between 6 and 7,years_of_experience_mapped_Between 7 and 8,years_of_experience_mapped_Between 8 and 9,years_of_experience_mapped_Between 9 and 10
15710,0,1,0,0,0,0,1,1,0,0,...,0,0,0,0,1,0,0,0,0,0
23532,0,1,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
23533,0,1,0,1,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
23534,0,1,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
23535,0,1,0,1,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61981,0,1,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
61982,0,1,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
61984,1,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
61987,0,1,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [233]:
# Split between testing and training 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)


In [234]:
# Fit model and build predictions
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [235]:
# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Calculate R-squared score
r2 = r2_score(y_test, y_pred)
print("R-squared Score:", r2)

# R-squared value is .34 (not good)

Mean Squared Error: 1769330035.414289
R-squared Score: 0.3384914476064045


## PCA

In [236]:
# # Create an instance of PCA and look for 4 most relevant characters of the model 
# pca = PCA(n_components=4)

In [237]:
# # # Create a df removing the y / dependent variables
# final_df_target_removed = df_final.drop(columns = ['basesalary', 'totalyearlycompensation', 'title', 'race', 'education','cityid','yearsatcompany'])
# final_df_target_removed

In [238]:
# # Fit the PCA model on the transformed credit card DataFrame
# # final_df_pca = pca.fit_transform(pd.get_dummies(df_final))
# final_df_pca = pca.fit_transform(pd.get_dummies(final_df_target_removed))

# # Review the first 5 rows of the array of list data
# # Putting data into array makes it easy to analyze
# final_df_pca[:3]

In [239]:
# # Calculate the PCA explained variance ratio
# # Four components explain 98% of the variance found in the original model
# pca.explained_variance_ratio_

In [240]:
# # Create a PCA df
# final_df_pca = pd.DataFrame(
#     final_df_pca,
#     columns=["PCA1", "PCA2", "PCA3", "PCA4"]
# )

# # Review the PCA DataFrame
# final_df_pca.head()

## Random Forest

In [241]:
X = df_final.drop(columns=['basesalary', 'salary_mapped'])
y = df_final['basesalary']
X_encoded = pd.get_dummies(X)
X_encoded

Unnamed: 0,gender_Female,gender_Male,gender_Other,education_mapped_College,education_mapped_High School,education_mapped_Master's Degree,education_mapped_PhD,race_mapped_Asian,race_mapped_Minority,race_mapped_White,...,state_mapped_ NJ,state_mapped_ NY,state_mapped_ OH,state_mapped_ OR,state_mapped_ PA,state_mapped_ TX,state_mapped_ UT,state_mapped_ VA,state_mapped_ WA,state_mapped_Other
15710,0,1,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
23532,0,1,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
23533,0,1,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
23534,0,1,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
23535,0,1,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61981,0,1,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
61982,0,1,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
61984,1,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
61987,0,1,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [242]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_encoded, y)
y_pred = model.predict(X_encoded)
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.4441103643438274


In [243]:
# Over Sampling
ros = RandomOverSampler()
X_oversampled, y_oversampled = ros.fit_resample(X_encoded, y)

In [244]:
# Over Sampling w/ Random Forest
X_train, X_test, y_train, y_test = train_test_split(X_oversampled, y_oversampled, test_size=0.2, random_state=42)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.6936087563477452


## Decision Tree

In [245]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Initialize the Decision Tree classifier
model = DecisionTreeClassifier()

# Fit the model on the training data
model.fit(X_train, y_train)

# Predict the target variable for the test set
y_pred = model.predict(X_test)

# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.05658709106984969


In [246]:
# Run Oversampling w/ Decision Tree
X_train, X_test, y_train, y_test = train_test_split(X_oversampled, y_oversampled, test_size=0.2, random_state=42)
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.693952697918142
