In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
import matplotlib.pyplot as plot
# encoders for categorical features (OrdinalEncoder / OneHotEncoder)
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# importing two different imputation methods that take into consideration all the features when predicting the missing values
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.dummy import DummyClassifier

# oversample the minority class using SMOTE
from imblearn.over_sampling import SMOTE
from collections import Counter

# Seed NumPy's legacy global RNG so that anything drawing from it is repeatable
# across a fresh "Restart & Run All".
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [None]:
# Load the suicide-rate fact table and the per-country dimension tables.
# NOTE(review): these are absolute paths under the filesystem root; confirm they
# resolve in the target environment (or move them behind a configurable data dir).
SR_main = pd.read_csv('/updated_tables/SR_fact_table.csv')
country_demo = pd.read_csv('/updated_tables/country_demographics.csv')
crime_rate = pd.read_csv('/updated_tables/crime_rate.csv')
employment = pd.read_csv('/updated_tables/employement.csv')
sub_abuse = pd.read_csv('/updated_tables/substance_abuse.csv')
disease = pd.read_csv('/updated_tables/disease.csv')

# Not used by the merge below; kept in case a later cell reads it.
data_frames = [SR_main, country_demo, crime_rate, employment, sub_abuse]

# Left-join every dimension table onto the fact table. The fact table's
# 'CountryKey' is matched against each dimension table's own key column, and the
# joins run in the order listed here.
dimension_joins = [
    (country_demo, 'country_key'),
    (crime_rate, 'crime_key'),
    (employment, 'employement_key'),
    (sub_abuse, 'substance_key'),
    (disease, 'disease_key'),
]
df_merge = SR_main
for dimension_table, dimension_key in dimension_joins:
    df_merge = pd.merge(
        df_merge,
        dimension_table,
        how='left',
        left_on=['CountryKey'],
        right_on=[dimension_key],
    )

# Keep only the columns used for modelling. fillna() returns a new frame here
# instead of mutating a selection of df_merge in place, which is the pattern that
# triggers pandas' SettingWithCopyWarning.
# NOTE(review): replacing every missing value with 0 means downstream imputers
# never see a NaN; confirm that 0 is a meaningful stand-in for these columns.
model_columns = [
    'CountryKey', 'sr_year', 'country_name', 'average_age', 'suicide_rate',
    'unemployement_rate', 'pct_drug_use', 'pct_tobacco_use', 'pct_alcohol_disorder',
]
df = df_merge[model_columns].fillna(0)

print(df.head(20))

# Exploratory plots: suicide rate per country, its distribution, and a box plot
# of the numeric columns.
df.plot(kind = 'scatter', x = 'country_name', y = 'suicide_rate')
df.plot(kind='hist', x = 'country_name', y = 'suicide_rate')
df.plot(kind = 'box')

# Summary statistics
print(df.describe())

In [None]:
# Separate the features and the labels to be used in model development.
data = df.drop(["country_name"], axis=1)
labels = df[["suicide_rate"]].copy()

# Check for missing values: per-column counts plus a sample of affected rows.
# NOTE(review): df is zero-filled with fillna(0) when it is built, so these are
# expected to be empty unless that step changes.
incomplete_rows = data.isnull()
rows_with_missing = data[incomplete_rows.any(axis=1)]
sample_incomplete_rows = rows_with_missing.head()
print(incomplete_rows.sum())
print(sample_incomplete_rows)

# Data imputation: candidate strategies for filling missing values.
# imputer_iter is constructed here but not applied in this cell.
imputer_simple = SimpleImputer(strategy='median')
imputer_knn = KNNImputer(n_neighbors=5)
imputer_iter = IterativeImputer(max_iter=10)

# Show the affected rows after imputing with each of the two strategies.
missing_data = ["suicide_rate"]
# Taken from the full set of incomplete rows; sample_incomplete_rows is left
# holding the preview frame rather than being overwritten with this index.
missing_indexes = rows_with_missing.index

# Median imputation on its own copy, so `data` and the KNN demo below still
# start from the un-imputed values.
train_median = data.copy()
train_median[missing_data] = imputer_simple.fit_transform(train_median[missing_data])
print("Median imputed data: \n",train_median.loc[train_median.index.isin(missing_indexes)])

# KNN imputation, again on an independent copy.
# NOTE(review): only the single 'suicide_rate' column is passed to the imputer,
# so it has no other features to measure neighbour distance on — confirm this
# is intended.
train_knn = data.copy()
train_knn[missing_data] = imputer_knn.fit_transform(train_knn[missing_data])
print("KNN imputed data: \n",train_knn.loc[train_knn.index.isin(missing_indexes)])

# Build the modelling frame: the target and the country name are dropped from
# the features, and the target becomes the label.
data = df.drop(["suicide_rate","country_name"], axis=1)
labels = df[["suicide_rate"]].copy()
X_train, X_test, y_train, y_test = train_test_split(data, labels, random_state=42)

In [None]:
# Preprocessing for the feature matrix: fill gaps with a 5-neighbour KNN
# imputer, then standardise with StandardScaler.
numeric_steps = [
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(numeric_steps)

# Every column of X_train is routed through the numeric pipeline.
num_features = list(X_train.columns)

full_pipeline = ColumnTransformer([("num", num_pipeline, num_features)])

# Fit the preprocessing on the training split and keep the transformed matrix.
prepared_data = full_pipeline.fit_transform(X_train)
print(prepared_data)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

print(y_train)

# y_train is a single-column DataFrame; flatten it to 1-D so the regressors
# receive the target shape they expect rather than a column vector.
y_train_values = np.ravel(y_train)

# Decision tree
dtr = DecisionTreeRegressor()
dtr.fit(prepared_data, y_train_values)

dtr_predict = dtr.predict(prepared_data)

# Root-mean-squared error on the TRAINING data. The tree has no depth limit, so
# this mostly shows how closely it fits the data it was trained on, not how well
# it generalises.
dtr_rmse = np.sqrt(mean_squared_error(y_train_values, dtr_predict))
dtr_mse = dtr_rmse  # legacy name: this has always held the RMSE, not the MSE
print(dtr_rmse)

# Random forest (shallow trees, fixed seed)
regr = RandomForestRegressor(max_depth=2, random_state=0)
regr.fit(prepared_data, y_train_values)

print(regr.predict(prepared_data))

# Gradient boosting (fixed seed)
reg = GradientBoostingRegressor(random_state=0)
reg.fit(prepared_data, y_train_values)

In [None]:
# Apply the preprocessing that was fitted on the training split. Calling
# fit_transform here would re-fit the imputer and scaler on the test rows, so the
# test features would be scaled differently from the data the models were
# trained on.
prepared_test_data = full_pipeline.transform(X_test)

prediction_DTR = dtr.predict(prepared_test_data)
prediction_REG = regr.predict(prepared_test_data)
prediction_GB = reg.predict(prepared_test_data)

# The target is a continuous rate, so classification_report / confusion_matrix
# do not apply. Report regression metrics on the held-out split instead: RMSE and
# R^2 (a regressor's .score() returns R^2).
y_test_values = np.ravel(y_test)
evaluated_models = (
    ("DTR", dtr, prediction_DTR),
    ("REG", regr, prediction_REG),
    ("GB", reg, prediction_GB),
)
for model_name, fitted_model, test_predictions in evaluated_models:
    test_rmse = np.sqrt(mean_squared_error(y_test_values, test_predictions))
    test_r2 = fitted_model.score(prepared_test_data, y_test_values)
    print(f"{model_name} test RMSE: {test_rmse:.4f} | R^2: {test_r2:.4f}")

ValueError: ignored