In [None]:
# Importing the necessary python libraries and modules required to build the model(s)
import pandas as pd
import hashlib
import numpy as np
from skmultilearn.problem_transform import LabelPowerset
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss

from sklearn.model_selection import train_test_split


In [None]:
# Function to convert hashkey string values present in the data into numerical values

def hash_to_number(hash_key):
    """Map a value from a hash-string column to a reproducible integer.

    The built-in ``hash()`` is salted per interpreter process for ``str``
    objects (see PYTHONHASHSEED), so the same string would be encoded
    differently after every kernel restart. Strings are therefore digested
    with SHA-256 and the first 8 bytes are read as a signed 64-bit integer,
    which is stable across runs and fits an int64 column.

    Parameters
    ----------
    hash_key : str or hashable
        Cell value to encode. Non-string values (e.g. the -1 placeholder used
        for missing data) are passed to the built-in ``hash()`` exactly as
        before.

    Returns
    -------
    int
        Deterministic integer code for ``hash_key``.
    """
    if isinstance(hash_key, str):
        digest = hashlib.sha256(hash_key.encode("utf-8")).digest()
        # 8 bytes, signed: same value range as the built-in hash on 64-bit builds.
        return int.from_bytes(digest[:8], byteorder="big", signed=True)
    # Numeric hashes are not salted, so the original behaviour is kept as is.
    return hash(hash_key)

In [None]:
# Random Forest classifier wrapped in a Label Powerset problem transformation
RFclassifier = LabelPowerset(
    # Fixed seed so the forest (and every score derived from it) is reproducible;
    # 0 matches the random_state already used for train_test_split.
    classifier=RandomForestClassifier(random_state=0),
    require_dense=[False, True],
)

In [None]:
# Decision Tree classifier wrapped in a Label Powerset problem transformation
DTclassifier = LabelPowerset(
    # Fixed seed so tie-breaking between equally good splits is reproducible;
    # 0 matches the random_state already used for train_test_split.
    classifier=DecisionTreeClassifier(random_state=0),
    require_dense=[False, True],
)

In [None]:
# Simple Neural Network (multi-layer perceptron), capped at 400 training iterations.
# The seed fixes weight initialisation and batch shuffling so scores are repeatable
# between runs; 0 matches the random_state already used for train_test_split.
NNclassifier = MLPClassifier(max_iter=400, random_state=0)



In [None]:
# Load the training features from train.csv with pandas
# (the test features and the training labels are loaded separately).
df1 = pd.read_csv(filepath_or_buffer="train.csv")


In [None]:
# Load the test data; header=None means the first line is read as data and the
# columns are labelled with integer positions (0, 1, 2, ...).
df2 = pd.read_csv(filepath_or_buffer="test.csv", header=None)

In [None]:
# Load the training labels.
df3 = pd.read_csv(filepath_or_buffer="trainLabels.csv")


In [None]:
# Join the training features with their labels on the shared 'id' column
merged_df = df1.merge(df3, on='id')

In [None]:

# Encode YES/NO as 1/0 and missing values as -1. The result is rebound instead of
# mutating the frame in place, so the cell gives the same result when re-run.
merged_df = merged_df.replace({'YES': 1, 'NO': 0, np.nan: -1})

# Columns that are still of object dtype after the replacement hold hash strings;
# map every value in them to a number.
object_columns = merged_df.select_dtypes(include='object').columns
for column in object_columns:
    merged_df[column] = merged_df[column].apply(hash_to_number)

# Split into features (X) and labels (y) with a single column selection each.
# Dropping one column per loop iteration copied the whole frame once per column.
# X keeps everything except the 'y*' columns, y keeps everything except the 'x*'
# columns, and the 'id' key is excluded from both.
feature_columns = [
    column for column in merged_df.columns
    if column != 'id' and not column.startswith('y')
]
label_columns = [
    column for column in merged_df.columns
    if column != 'id' and not column.startswith('x')
]
X = merged_df[feature_columns]
y = merged_df[label_columns]


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=0, test_size=0.2)


In [None]:
# Train the Random Forest model on the training split (passed as plain arrays).
RFclassifier.fit(X=np.array(X_train), y=np.array(y_train))

In [None]:
# Predict the held-out split with the Random Forest model.
predictions1 = RFclassifier.predict(X=X_test)

# Accuracy of the Random Forest predictions against the held-out labels.
accuracy_score(y_true=y_test, y_pred=predictions1)


In [None]:
# Hamming loss of the Random Forest predictions on the held-out split.
hamming_loss(y_true=y_test, y_pred=predictions1)

In [None]:
# Train the neural network on the training split (passed as plain arrays).
NNclassifier.fit(X=np.array(X_train), y=np.array(y_train))

In [None]:
# Predict the held-out split with the neural network. The model was fitted on a
# bare ndarray, so the test features are converted the same way; passing the
# DataFrame directly makes scikit-learn warn that the input has feature names
# the estimator was fitted without.
predictions2 = NNclassifier.predict(np.array(X_test))

In [None]:
# Accuracy of the neural network predictions against the held-out labels.
accuracy_score(y_true=y_test, y_pred=predictions2)

In [None]:
# Train the Decision Tree model on the training split (passed as plain arrays).
DTclassifier.fit(X=np.array(X_train), y=np.array(y_train))

In [None]:
# Predict the held-out split with the Decision Tree model.
predictions3 = DTclassifier.predict(X=X_test)

# Accuracy of the Decision Tree predictions against the held-out labels.
accuracy_score(y_true=y_test, y_pred=predictions3)

In [None]:
# Apply the same preprocessing to the test data.
# Column 0 is dropped first, so the object-column detection below can never name a
# column that no longer exists (that would raise KeyError in the hashing loop).
# drop() and replace() both return new frames, so the raw `df2` is left untouched
# and the cell gives the same result when re-run.
merged_df2 = df2.drop(columns=0).replace({'YES': 1, 'NO': 0, np.nan: -1})

# Columns that are still of object dtype after the replacement hold hash strings.
object_columns = merged_df2.select_dtypes(include='object').columns

for column in object_columns:
    merged_df2[column] = merged_df2[column].apply(hash_to_number)


In [None]:
# Hold-out accuracy: Random Forest classifier ~82%, neural network ~25% (with high variance), Decision Tree classifier ~72%.
# Hence the Random Forest classifier is retrained on the entire training data to make predictions for the test data.

In [None]:
# Refit the Random Forest model on all training rows (no hold-out split).
RFclassifier.fit(X=np.array(X), y=np.array(y))

In [None]:
# Predict labels for the preprocessed test features with the refitted Random Forest.
predictions4 = RFclassifier.predict(X=merged_df2)

print(predictions4)

In [None]:
# Expand the prediction matrix into its dense form.
dense_matrix = predictions4.todense()

# Tabulate the dense predictions: one row per test sample, one column per label.
final_pred = pd.DataFrame(data=dense_matrix)
display(final_pred)

In [None]:
# Attach an 'id' to every prediction row: 1698001 plus the row's index value.
# NOTE(review): 1698001 is presumably the first id in the test set - confirm against test.csv.
final_pred['id'] = final_pred.index + 1698001

# Reorder so that 'id' is the leading column, followed by the prediction columns.
non_id_columns = [col for col in final_pred.columns if col != 'id']
final_pred = final_pred[['id', *non_id_columns]]


In [None]:

# Extracting the 'id' column
id_col = final_pred['id']

# Prediction columns: everything after the leading 'id' column. They carry integer
# labels 0, 1, 2, ... which are published as y1, y2, y3, ...
label_columns = final_pred.columns[1:]

# Reshape the wide table (one row per id) into long format: one row per (id, label)
# pair, ordered by id and then by label. All records are collected first and the
# DataFrame is built once; growing a frame with pd.concat inside the loop re-copies
# every accumulated row on each iteration (quadratic time). itertuples also keeps
# each column's own dtype, so the id is not coerced to the dtype of the predictions.
records = [
    {'id_label': f"{current_id}_y{col + 1}", 'pred': pred}
    for current_id, *preds in final_pred.itertuples(index=False, name=None)
    for col, pred in zip(label_columns, preds)
]
transformed_df = pd.DataFrame(records, columns=['id_label', 'pred'])

# Displaying the transformed DataFrame
print(transformed_df)


In [None]:
# Write the final predictions to the output CSV file, without the index column.
transformed_df.to_csv(path_or_buf='NitheeshP-prediction-output.csv', index=False)


