In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
#from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV

In [4]:
df = pd.read_csv('subAfricaDf.csv')

In [5]:
# get categorical columns in train dataset with missing values and store in missing_cat_cols
list_missing_cat_columns = list((df.select_dtypes(['object']).isna().sum() > 0).index)
list_missing_cat_columns

['Country',
 'State',
 'City',
 'Region_name',
 'AttackType',
 'Target',
 'Summary',
 'Group',
 'Target_type',
 'Weapon_type',
 'sub_weapon_type']

In [6]:
# Fill Categorical columns in data with mode
for col in list_missing_cat_columns:
    df[col] = df[col].fillna('Unknown')

In [7]:
## Fill missing values for numeric columns
# get numeric columns with missing values and store in lst_missing_numeric_col
list_missing_numeric_col = list((df.select_dtypes(np.number).isna().sum() > 0).index)
list_missing_numeric_col

['Year',
 'Month',
 'Day',
 'Region',
 'duration_of_incident',
 'Killed',
 'Wounded',
 'Latitude',
 'Longitude',
 'Suicide',
 'Casualities',
 'Success']

In [8]:
# Fill numeric columns in train dataset with mean
for col in list_missing_numeric_col:
    df[col] = df[col].fillna(df[col].median())

In [9]:
df1 = df.copy()

In [10]:
X= df1.drop(['Success','duration_of_incident','Summary','Latitude','State','Longitude','sub_weapon_type','Year', 'Month', 'Day', 'Casualities','City', 'Region', 'Region_name','Killed', 'Wounded','Target'],
            axis=1)
y= df1['Success']

In [11]:
cat_cols=['Group','Weapon_type','Country','AttackType','Target_type']

from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

for col in cat_cols:
    X[col] = le.fit_transform(X[col])

In [12]:
'''np.bincount (y) 
ros = RandomOverSampler (random_state=2)
X, y = ros.fit_resample (X,y)
np.bincount (y) '''

'np.bincount (y) \nros = RandomOverSampler (random_state=2)\nX, y = ros.fit_resample (X,y)\nnp.bincount (y) '

In [13]:
RandomForestClassifier(random_state = 1)

RandomForestClassifier(random_state=1)

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [15]:
reg_rf = RandomForestClassifier()
reg_rf.fit(X_train, y_train)

RandomForestClassifier()

In [16]:
y_pred = reg_rf.predict(X_test)

In [17]:
reg_rf.score(X_train, y_train)

0.947934472934473

In [18]:
reg_rf.score(X_test, y_test)

0.9319088319088319

In [19]:
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

MAE: 0.0680911680911681
MSE: 0.0680911680911681
RMSE: 0.26094284449121824


In [20]:
metrics.r2_score(y_test, y_pred)

-0.036977611202310134

In [21]:
#Randomized Search CV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

In [22]:
# Create the random grid

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

In [23]:
# Random search of parameters, using 5 fold cross validation, 
# search across 100 different combinations
rf_random = RandomizedSearchCV(estimator = reg_rf, param_distributions = random_grid,scoring='neg_mean_squared_error',n_iter = 10,cv = 5,random_state = 1, n_jobs =1)

In [24]:
rf_random.fit(X_train,y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=1,
                   param_distributions={'max_depth': [5, 10, 15, 20, 25, 30],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 5, 10],
                                        'min_samples_split': [2, 5, 10, 15,
                                                              100],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500, 600, 700, 800,
                                                         900, 1000, 1100,
                                                         1200]},
                   random_state=1, scoring='neg_mean_squared_error')

In [25]:
X.shape

(17550, 6)

In [26]:
rf_random.best_params_

{'n_estimators': 500,
 'min_samples_split': 5,
 'min_samples_leaf': 5,
 'max_features': 'auto',
 'max_depth': 15}

In [27]:
prediction = rf_random.predict(X_test)

In [28]:
import pickle
# open a file, where you ant to store the data
file = open('terrorist_rf2.pkl', 'wb')

# dump information to that file
pickle.dump(rf_random, file)
pickle.dump(rf_random, file)

In [29]:
model = open('terrorist_rf2.pkl','rb')
forest = pickle.load(model)

In [30]:
y_prediction = forest.predict(X_test)

In [31]:
metrics.r2_score(y_test, y_prediction)

0.015088210280650993