In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load the raw dataset from the working directory into a DataFrame
data = pd.read_csv(filepath_or_buffer='master.csv')



In [17]:
# Keep only complete rows: any row containing at least one NaN is discarded
data = data.dropna(axis=0, how='any')

# Initial candidate predictors: age band, GDP per capita and generation label
candidate_columns = ['age', 'gdp_per_capita_Dollars', 'generation']
features = data[candidate_columns]

In [18]:
# Display the three selected columns (rich DataFrame repr of the whole frame)
features

Unnamed: 0,age,gdp_per_capita_Dollars,generation
0,15-24 years,796,Gen X
1,35-54 years,796,Silent
2,15-24 years,796,Gen X
3,75+ years,796,G.I Gen
4,25-34 years,796,Boomers
...,...,...,...
27815,35-54 years,2309,Gen X
27816,75+ years,2309,Silent
27817,5-14 years,2309,Gen Z
27818,5-14 years,2309,Gen Z


In [21]:
def encode_categorical_variables(df):
    """One-hot encode every object-dtype column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-typed columns should be expanded into indicator
        columns. The input frame itself is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which each object-dtype column has been replaced by
        one indicator column per distinct value (named ``<column>_<value>``);
        all other columns are carried over unchanged.
    """
    # Only object-dtype columns are treated as categorical here
    object_columns = df.select_dtypes(include=['object']).columns
    return pd.get_dummies(data=df, columns=object_columns)


# Expand every object-dtype column of the cleaned frame into indicator columns.
# NOTE(review): the recorded output shows 2441 resulting columns, most of them
# `country-year_*` dummies with zero importance - identifier-like columns
# probably should be dropped before encoding; confirm.
data_encoded = encode_categorical_variables(data)

# Split the encoded frame into the regression target and the predictors
target = data_encoded['suicides/100k']
features = data_encoded.drop(columns='suicides/100k')

# Fit a seeded random forest on the full data purely to rank the predictors
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(features, target)

# NOTE(review): in the recorded output `suicides_no` and `population` together
# carry ~0.89 of the importance. If `suicides/100k` is computed from those two
# columns (as its name suggests), this is target leakage rather than a genuine
# signal - TODO confirm before relying on this ranking.
importances = rf.feature_importances_

# Pair each column name with its score, most important first
feature_importances = pd.DataFrame(
    {'Feature': features.columns, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

print(feature_importances)

                                     Feature  Importance
1                                suicides_no    0.448986
2                                 population    0.438351
106                               sex_female    0.049629
107                                 sex_male    0.020659
80                country_Russian Federation    0.008725
...                                      ...         ...
1530             country-year_Montenegro2002    0.000000
1529             country-year_Montenegro2001    0.000000
1528             country-year_Montenegro2000    0.000000
991                 country-year_Grenada2001    0.000000
1888  country-year_Saint Kitts and Nevis1990    0.000000

[2441 rows x 2 columns]


In [22]:
from sklearn.model_selection import train_test_split

# Keep the highest-ranked predictors from the importance table and expand the
# categorical 'sex' column into sex_female / sex_male indicator columns.
# NOTE(review): `suicides_no` and `population` look like the numerator and
# denominator of the target rate - confirm this is not target leakage.
features = pd.get_dummies(
    data[['suicides_no', 'population', 'sex']],
    columns=['sex'],
)

# Regression target: the suicide rate column
target = data['suicides/100k']



In [27]:
# Display the encoded feature frame; the column order shown here
# (suicides_no, population, sex_female, sex_male) is the order the model is fitted on
features

Unnamed: 0,suicides_no,population,sex_female,sex_male
0,21,312900,False,True
1,16,308000,False,True
2,14,289700,True,False
3,1,21800,False,True
4,9,274300,False,True
...,...,...,...,...
27815,107,3620833,True,False
27816,9,348465,True,False
27817,60,2762158,False,True
27818,44,2631600,True,False


In [23]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Build a seeded 100-tree random forest and train it on the training portion
# (fit() returns the estimator itself, so the calls can be chained)
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the target for the held-out rows
predictions = rf.predict(X_test)

In [25]:
# Score the held-out predictions with three regression metrics, in the order
# MSE, MAE, R^2.
# NOTE(review): the recorded R^2 of ~0.996 is suspiciously high; it would be
# expected if the target can be computed from the input columns - TODO confirm.
mse, mae, r2 = (
    metric(y_test, predictions)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

for report_line in (
    f"Mean Squared Error: {mse}",
    f"Mean Absolute Error: {mae}",
    f"R-Squared: {r2}",
):
    print(report_line)

Mean Squared Error: 1.4508637691121529
Mean Absolute Error: 0.22787670740474444
R-Squared: 0.9958482596419067


In [29]:
# Build the single-row input as a DataFrame with explicit column names.
# The model was fitted on a DataFrame, so passing a bare nested list makes
# scikit-learn warn about missing feature names and silently relies on the
# positional order of the one-hot columns. Naming the columns makes the
# mapping explicit and lets scikit-learn validate it against the fitted names.
# NOTE(review): these values match row 0 of the displayed feature frame except
# for the sex flags (row 0 is sex_male=True) - confirm female is intended.
input_data = pd.DataFrame(
    [[21, 312900, 1, 0]],
    columns=['suicides_no', 'population', 'sex_female', 'sex_male'],
)
prediction = rf.predict(input_data)
print(f"Predicted suicides/100k: {prediction[0]}")

Predicted suicides/100k: 6.703099999999997




In [30]:
import os
import pickle

# Ensure the output directory exists. exist_ok=True makes this a no-op when
# the directory is already present, which avoids the check-then-create race of
# a separate os.path.exists() test and keeps the cell safe to re-run.
os.makedirs('model', exist_ok=True)

# Serialize the trained model to a pickle file
with open('model/random_forest_model.pkl', 'wb') as f:
    pickle.dump(rf, f)

In [34]:
# Round-trip check: reload the pickled model and confirm it still predicts.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles from a trusted source (here, the file this notebook just wrote).
with open('model/random_forest_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

# Use a DataFrame with explicit column names rather than a bare nested list:
# the model was fitted on named columns, so this avoids scikit-learn's
# missing-feature-names warning and makes the one-hot mapping explicit.
# NOTE(review): row 0 of the displayed feature frame has these counts with
# sex_male=True - confirm that sex_female=1 is the intended input.
input_data = pd.DataFrame(
    [[21, 312900, 1, 0]],
    columns=['suicides_no', 'population', 'sex_female', 'sex_male'],
)
prediction = loaded_model.predict(input_data)
print(f"Predicted suicides/100k: {prediction[0]}")

Predicted suicides/100k: 6.703099999999997


