## Importing all the required libraries

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.multiclass import OneVsRestClassifier

### Reading the training dataset to Pandas DataFrame

In [16]:
data = pd.read_csv('new_appended.csv')
data.head()

Unnamed: 0,Accident_ID,Accident_Type_Code,Adverse_Weather_Metric,Cabin_Temperature,Control_Metric,Days_Since_Inspection,Max_Elevation,Safety_Score,Severity,Total_Safety_Complaints,Turbulence_In_gforces,Violations
0,7570.0,2.0,0.424352,78.04,71.285324,14.0,31335.476824,49.223744,Minor_Damage_And_Injuries,22.0,0.272118,3.0
1,12128.0,2.0,0.35235,84.54,72.288058,10.0,26024.711057,62.465753,Minor_Damage_And_Injuries,27.0,0.423939,2.0
2,2181.0,7.0,0.003364,78.86,66.362808,13.0,39269.053927,63.059361,Significant_Damage_And_Fatalities,16.0,0.322604,3.0
3,5946.0,3.0,0.211728,81.79,74.703737,11.0,42771.4992,48.082192,Significant_Damage_And_Serious_Injuries,9.0,0.337029,1.0
4,9054.0,3.0,0.176883,77.16,47.948952,13.0,35509.228515,26.484018,Significant_Damage_And_Fatalities,25.0,0.54114,2.0


### Getting the target variables to Y variable

In [17]:
Y = data['Severity']
Y.shape

(12245,)

### Dropoing the irrelevent columns from training data

In [18]:
data = data.drop(columns=['Severity','Accident_ID','Adverse_Weather_Metric','Accident_Type_Code'],axis=1)
data.columns

Index(['Cabin_Temperature', 'Control_Metric', 'Days_Since_Inspection',
       'Max_Elevation', 'Safety_Score', 'Total_Safety_Complaints',
       'Turbulence_In_gforces', 'Violations'],
      dtype='object')

### creating the Label Encoder object which will encode the target severities to numerical form

In [19]:
label_encode = LabelEncoder()
y = label_encode.fit_transform(Y)

In [20]:
x_train,x_test,y_train,y_test = train_test_split(data,y,test_size=0.2)

In [21]:
from xgboost import XGBClassifier

In [22]:
e_set=[(x_train, y_train), (x_test, y_test)]

xgb = XGBClassifier(n_estimators=500,learning_rate=0.1,max_depth=10,reg_lambda=0.1,importance_type='total_gain',
                base_score=0.1,min_child_weight=0.1,scale_pos_weight=0.5)

In [23]:
#xgb.fit(x_train,y_train)

In [24]:
# predictions = xgb.predict(x_test)
# acc = 100 * (f1_score(y_test,predictions,average='weighted'))
# acc

In [25]:
dtc = DecisionTreeClassifier(criterion='entropy',max_depth=10,class_weight='balanced')
bag = BaggingClassifier(n_estimators=150,oob_score=True, bootstrap_features=True,base_estimator=dtc)

In [26]:
rfc = RandomForestClassifier(n_estimators=150,criterion='entropy',max_depth=10,oob_score=True,class_weight='balanced')

In [34]:
voting = VotingClassifier(estimators=[
        ('lr', bag), ('rf', xgb), ('gnb', rfc)], voting='soft')
voting.fit(data,y)

VotingClassifier(estimators=[('lr',
                              BaggingClassifier(base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                                                      class_weight='balanced',
                                                                                      criterion='entropy',
                                                                                      max_depth=10,
                                                                                      max_features=None,
                                                                                      max_leaf_nodes=None,
                                                                                      min_impurity_decrease=0.0,
                                                                                      min_impurity_split=None,
                                                                                      min_samples_leaf=1,
   

In [35]:
predictions = voting.predict(x_test)
acc = 100 * (f1_score(y_test,predictions,average='weighted'))
acc

100.0

In [36]:
test_data = pd.read_csv('test.csv')
accident_id = test_data['Accident_ID']
print(test_data.shape)
test_data = test_data.drop(columns=['Accident_ID','Adverse_Weather_Metric','Accident_Type_Code'],axis=1)
cols = ['Cabin_Temperature', 'Control_Metric', 'Days_Since_Inspection',
       'Max_Elevation', 'Safety_Score', 'Total_Safety_Complaints',
       'Turbulence_In_gforces', 'Violations']
test_data = test_data[cols]
predictions = voting.predict(test_data)
predictions = label_encode.inverse_transform(predictions)
result_df = pd.DataFrame({'Accident_ID':accident_id,'Severity':predictions})
result_df.to_csv('Prediction.csv',index=False)

(2500, 11)


### Accuracy - 85.81

In [33]:
t1 = pd.read_csv('Prediction_xgb1.csv')
t2 = pd.read_csv('Prediction_xgb2.csv')
t3 = pd.read_csv('Prediction_xgb3.csv')
t4 = pd.read_csv('Prediction_xgb4.csv')

count = 0
row = []
for i in range(len(t1)):
    if t4.iloc[i]['Severity'] == t1.iloc[i]['Severity']:
        if t4.iloc[i]['Severity'] == t2.iloc[i]['Severity']:
            if t4.iloc[i]['Severity'] == t3.iloc[i]['Severity']:
                count += 1
                row.append(i)
        
count

2416

In [2]:
import random

In [190]:
for i in range(1500):
    row.pop(random.randrange(len(row)))
    
len(row)

944