In [1]:
# Importing the necessary python libraries and modules required to build the model(s)
import pandas as pd
import hashlib
import numpy as np
from skmultilearn.problem_transform import LabelPowerset
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss

from sklearn.model_selection import train_test_split


In [2]:
# Function to convert hashkey string values present in the data into numerical values 

def hash_to_number(hash_key):
    return hash(hash_key)

In [3]:
# Random Forest Classifier 
RFclassifier = LabelPowerset(
    classifier = RandomForestClassifier(),
    require_dense = [False,True]
)

In [4]:
# Decision Tree Classifier
DTclassifier = LabelPowerset(
    classifier = DecisionTreeClassifier(),
    require_dense = [False,True]
)

In [5]:
# Simple Neural Network
NNclassifier = MLPClassifier(max_iter=400)



In [6]:
#importing the train,test and training labels using pandas
df1 = pd.read_csv("train.csv")


In [7]:
df2 = pd.read_csv("test.csv",header = None)

In [8]:
df3 = pd.read_csv("trainLabels.csv")


In [9]:
# merging train and training label 
merged_df = pd.merge(df1, df3, on='id')

In [10]:

# replacing columns with YES/NO data with 1/0 and empty values with -1
merged_df.replace({'YES':1,'NO':0,np.nan:-1},inplace=True)
    
object_columns = merged_df.select_dtypes(include='object').columns

# applying the hash function to the rows containing hash strings
for column in object_columns:
    merged_df[column] = merged_df[column].apply(hash_to_number)
    
X = merged_df
y = merged_df
    
for column in merged_df.columns:
    if column.startswith('y'):
        X = X.drop(columns=column)
    elif column.startswith('x'):
        y = y.drop(columns=column)
X = X.drop(columns='id')
y = y.drop(columns='id')


In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)


In [12]:
RFclassifier.fit(np.array(X_train), np.array(y_train))

In [13]:
predictions1 = RFclassifier.predict(X_test)

accuracy_score(y_test,predictions1)


0.823

In [14]:
hamming_loss(y_test, predictions1)

0.011196969696969697

In [15]:
NNclassifier.fit(np.array(X_train), np.array(y_train))

In [16]:
predictions2 = NNclassifier.predict(X_test)



In [17]:
accuracy_score(y_test,predictions2)

0.253

In [18]:
DTclassifier.fit(np.array(X_train), np.array(y_train))

In [28]:
predictions3 = DTclassifier.predict(X_test)

accuracy_score(y_test,predictions3)

0.723

In [20]:
# applying preprocessing for test data
merged_df2 = df2

merged_df2.replace({'YES':1,'NO':0,np.nan:-1},inplace=True)
    
object_columns = merged_df2.select_dtypes(include='object').columns

merged_df2 = merged_df2.drop(columns=0)


for column in object_columns:
    merged_df2[column] = merged_df2[column].apply(hash_to_number)


In [21]:
# Accuracy of Random Forest Classifier ~82%, Accuracy of Neural Network ~25% (with high variance),Accuracy of Decision Tree Classifier ~72%
# Hence Random Forest Classifier will be used with the entire training data to make prediction for the test data

In [22]:
RFclassifier.fit(np.array(X), np.array(y))

In [23]:
predictions4 = RFclassifier.predict(merged_df2)

print(predictions4)

  (0, 32)	1
  (1, 32)	1
  (2, 32)	1
  (3, 5)	1
  (3, 11)	1
  (4, 28)	1
  (5, 20)	1
  (6, 3)	1
  (7, 32)	1
  (8, 8)	1
  (9, 32)	1
  (10, 32)	1
  (11, 32)	1
  (12, 32)	1
  (13, 25)	1
  (14, 32)	1
  (15, 5)	1
  (15, 11)	1
  (16, 0)	1
  (17, 32)	1
  (18, 32)	1
  (19, 5)	1
  (19, 11)	1
  (20, 8)	1
  (21, 32)	1
  (22, 31)	1
  (23, 32)	1
  (24, 25)	1
  (25, 32)	1
  (26, 32)	1
  (27, 31)	1
  (28, 32)	1
  (29, 32)	1
  (30, 32)	1
  (31, 32)	1
  (32, 32)	1
  (33, 32)	1
  (34, 32)	1
  (35, 8)	1
  (36, 32)	1
  (37, 32)	1
  (38, 5)	1
  (38, 30)	1
  (39, 9)	1
  (40, 32)	1
  (41, 32)	1
  (42, 12)	1
  (43, 5)	1
  (43, 11)	1
  (44, 32)	1
  (45, 32)	1
  (46, 32)	1
  (47, 32)	1
  (48, 8)	1
  (49, 32)	1
  (50, 32)	1
  (51, 32)	1
  (52, 32)	1
  (53, 5)	1
  (54, 32)	1
  (55, 27)	1
  (56, 5)	1
  (56, 11)	1
  (57, 32)	1
  (58, 32)	1
  (59, 32)	1
  (60, 9)	1
  (61, 29)	1
  (62, 32)	1
  (63, 32)	1
  (64, 32)	1
  (65, 0)	1
  (66, 27)	1
  (67, 5)	1
  (67, 11)	1
  (68, 32)	1
  (69, 8)	1
  (70, 12)	1
  (71, 32)	1
  

In [24]:
dense_matrix = predictions4.todense()

# Creating a pandas DataFrame from the dense matrix
final_pred = pd.DataFrame(dense_matrix)
display(final_pred)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,32
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1996,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [25]:
final_pred['id'] = 1698001 + final_pred.index
final_pred = final_pred[['id'] + [col for col in final_pred.columns if col != 'id']]


In [26]:

# Extracting the 'id' column
id_col = final_pred['id']

# Creating an empty DataFrame to store the transformed data
transformed_df = pd.DataFrame(columns=['id_label', 'pred'])

# Iterating over each row in the original DataFrame
for index, row in final_pred.iterrows():
    # Extracting the 'id' value for the current row
    current_id = row['id']
    
    # Iterating over the remaining columns (y1, y2, y3, etc.)
    for col in final_pred.columns[1:]:
        # Generating the new 'col1' value by concatenating id and column name
        col1_value = f"{current_id}_y{col+1}"
        
        # Getting the 'col2' value from the original DataFrame
        col2_value = row[col]
        
        # Appending the new values to the transformed DataFrame
        transformed_df = pd.concat([transformed_df, pd.DataFrame({'id_label': [col1_value], 'pred': [col2_value]})], ignore_index=True)

# Displaying the transformed DataFrame
print(transformed_df)


          id_label pred
0       1698001_y1    0
1       1698001_y2    0
2       1698001_y3    0
3       1698001_y4    0
4       1698001_y5    0
...            ...  ...
65995  1700000_y29    0
65996  1700000_y30    0
65997  1700000_y31    0
65998  1700000_y32    0
65999  1700000_y33    1

[66000 rows x 2 columns]


In [27]:
#Final Output file generated
transformed_df.to_csv('NitheeshP-prediction-output.csv',index=False)


