In [1]:
# Import the necessary libraries
import pandas as pd
import re
import nltk 
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Read the train and test splits from CSV (paths are relative to the working directory)
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'dataset_challenge_DAY3/train_set.csv',
        'dataset_challenge_DAY3/new_test.csv',
    )
)

In [3]:
# Target labels: every column of 'train' except 'CELEX_ID', 'Text' and 'Citations'.
# drop() returns a new DataFrame, so later in-place edits of train['Text'] do not touch y_train.
y_train = train.drop(columns=['CELEX_ID', 'Text', 'Citations'])

In [4]:
# Define a text cleaning function that keeps only a fixed character window of the input text
def take_some_text(text, start=149, end=8000):
    """Return the slice ``text[start:end]`` of a document.

    Args:
        text: The string to truncate.
        start: Index of the first character to keep. Defaults to 149, the
            offset this notebook has always used.
        end: Index one past the last character to keep. Defaults to 8000.

    Returns:
        The characters at positions ``start`` .. ``end - 1``. Ordinary slice
        semantics apply: a text shorter than ``start`` yields ``""`` and a
        text shorter than ``end`` is cut at its own length.
    """
    return text[start:end]

In [5]:
# Define a function to remove English (and optional extra) stopwords from a text column in a dataframe
def remove_stopwords_and_punkt(df, text_column="text", legal_stopwords=False):
    """Lower-case, tokenize and strip stopwords from ``df[text_column]``.

    Despite the name, punctuation is NOT removed here: 'punkt' is the NLTK
    tokenizer resource downloaded for ``word_tokenize``. Punctuation tokens
    produced by the tokenizer are kept (space-separated) in the output.

    Args:
        df: DataFrame holding the text. It is modified in place.
        text_column: Name of the column to clean.
        legal_stopwords: Optional iterable of extra words to drop in addition
            to the English stopword list. ``False`` (the default) or ``None``
            means "no extra words".

    Returns:
        The same ``df`` object, with ``text_column`` replaced by the cleaned
        text (tokens re-joined with single spaces).
    """
    nltk.download('stopwords')  # English stopword lists
    nltk.download('punkt')      # Tokenizer resource for word_tokenize
    # NOTE(review): newer NLTK releases reportedly load 'punkt_tab' instead of
    # 'punkt' -- confirm against the installed NLTK version.

    stop_words = set(stopwords.words('english'))

    # The previous test `not legal_stopwords == False` let None through and then
    # crashed in set.union(None); identity checks accept False/None as "nothing".
    if legal_stopwords is not None and legal_stopwords is not False:
        # Tokens are lower-cased before the lookup, so the extra words must be too,
        # otherwise mixed-case entries could never match.
        stop_words.update(word.lower() for word in legal_stopwords)

    def remove_stop_words(text):
        tokens = word_tokenize(text.lower())
        return " ".join(token for token in tokens if token not in stop_words)

    df[text_column] = df[text_column].apply(remove_stop_words)

    return df

In [6]:
# Define a function to remove punctuation and convert text to lowercase in a dataframe column
def remove_punct(df, text_column='Text'):
    """Strip ASCII punctuation from ``df[text_column]`` and lower-case it.

    Args:
        df: DataFrame holding the text. The column is overwritten in place.
        text_column: Name of the string column to clean. Defaults to 'Text',
            the column this notebook uses.

    Returns:
        The cleaned column (``df[text_column]``) as a Series.
    """
    # Build the deletion table once; it is the same for every row.
    strip_table = str.maketrans('', '', string.punctuation)
    df[text_column] = df[text_column].apply(
        lambda text: text.translate(strip_table).lower()
    )

    return df[text_column]

In [7]:
# Define a function to remove digits from text in a dataframe column
def remove_digit(df, text_column='Text'):
    """Delete every run of digits from ``df[text_column]``.

    Args:
        df: DataFrame holding the text. The column is overwritten in place.
        text_column: Name of the string column to clean. Defaults to 'Text',
            the column this notebook uses.

    Returns:
        The cleaned column (``df[text_column]``) as a Series. Whitespace
        around removed digits is left untouched.
    """
    # Compile once instead of looking the pattern up for every row.
    digit_run = re.compile(r'\d+')
    df[text_column] = df[text_column].apply(lambda text: digit_run.sub('', text))

    return df[text_column]

In [8]:
# Define a function to remove specified symbols from text in a dataframe column
def remove_symbols(df, text_column='Text'):
    """Delete the characters ``/``, ``$``, ``@``, ``\\`` and ``.`` from ``df[text_column]``.

    Args:
        df: DataFrame holding the text. The column is overwritten in place.
        text_column: Name of the string column to clean. Defaults to 'Text',
            the column this notebook uses.

    Returns:
        The cleaned column (``df[text_column]``) as a Series.
    """
    # The earlier version listed the regex fragment '\.+' among the symbols and
    # then ran every entry through re.escape, so that entry only matched the
    # literal text backslash-dot-plus and dots were never removed. A character
    # class states the intended set directly; inside [...] the characters
    # '$' and '.' are literal and '\\' is a single backslash.
    symbol_run = re.compile(r'[/$@\\.]+')

    df[text_column] = df[text_column].apply(lambda text: symbol_run.sub('', text))

    return df[text_column]

In [9]:
# Clean the 'Text' column of the train set. Every helper below writes its result back into
# train['Text'], so the steps are cumulative and their order matters.
# remove_stopwords_and_punkt modifies 'train' in place and returns that same object, so
# X_train and train are two names for one DataFrame.
X_train = remove_stopwords_and_punkt(train, text_column="Text")  # Lower-case, tokenize and drop English stopwords
X_train['Text'] = train['Text'].apply(take_some_text)  # Keep characters 149..7999 of the stopword-filtered text
X_train['Text'] = remove_digit(train)  # Remove digits
X_train['Text'] = remove_symbols(train)  # Remove specified symbols
X_train['Text'] = remove_punct(train)  # Remove punctuation and lower-case

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/simonemaiorani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/simonemaiorani/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
# Clean the 'Text' column of the test set with the same steps, in the same order, as the train set.
# Each helper overwrites test['Text'] in place and returns the cleaned column, so the steps are cumulative.
test = remove_stopwords_and_punkt(test, text_column="Text")  # Lower-case, tokenize and drop English stopwords (returns the same DataFrame)
test['Text'] = test['Text'].apply(take_some_text)  # Keep characters 149..7999 of the stopword-filtered text
test['Text'] = remove_digit(test)  # Remove digits
test['Text'] = remove_symbols(test)  # Remove specified symbols
test['Text'] = remove_punct(test)  # Remove punctuation and lower-case

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/simonemaiorani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/simonemaiorani/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
# Create a TF-IDF vectorizer with default settings
tfidf = TfidfVectorizer()

# Fit the vectorizer on the train 'Text' column (cleaned in place by the earlier cell) and transform it
X_train_idf_final = tfidf.fit_transform(train['Text'])

# Transform the test 'Text' column with the vectorizer fitted on train only,
# so both matrices use the vocabulary and weights learned from the train set
X_test_idf_final = tfidf.transform(test['Text'])

In [12]:
# Base estimator: a two-hidden-layer MLP (120 units each) with a fixed seed.
# Note: per the scikit-learn documentation 'learning_rate' only takes effect with
# solver='sgd'; it is kept here so the estimator's parameters stay exactly as before.
mlp = MLPClassifier(
    hidden_layer_sizes=(120, 120),
    activation='tanh',
    solver='adam',
    learning_rate='adaptive',
    max_iter=25,
    random_state=42,
    verbose=True,
)

# MultiOutputClassifier fits one copy of the base estimator per target column of y_train
classifier = MultiOutputClassifier(mlp)

# Train on the TF-IDF matrix of the train text; verbose=True logs the loss of every iteration
classifier.fit(X_train_idf_final, y_train)

Iteration 1, loss = 0.07462042
Iteration 2, loss = 0.00422734
Iteration 3, loss = 0.00332522
Iteration 4, loss = 0.00217096
Iteration 5, loss = 0.00118655
Iteration 6, loss = 0.00067113
Iteration 7, loss = 0.00041366
Iteration 8, loss = 0.00034698
Iteration 9, loss = 0.00030360
Iteration 10, loss = 0.00026066
Iteration 11, loss = 0.00024162
Iteration 12, loss = 0.00022239
Iteration 13, loss = 0.00020702
Iteration 14, loss = 0.00019510
Iteration 15, loss = 0.00018482
Iteration 16, loss = 0.00017700
Iteration 17, loss = 0.00016932
Iteration 18, loss = 0.00016290
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 0.07836418
Iteration 2, loss = 0.01178027
Iteration 3, loss = 0.00842916
Iteration 4, loss = 0.00505814
Iteration 5, loss = 0.00249807
Iteration 6, loss = 0.00125523
Iteration 7, loss = 0.00080760
Iteration 8, loss = 0.00057395
Iteration 9, loss = 0.00046266
Iteration 10, loss = 0.00039478
Iteration 11, loss = 0.00033442




Iteration 1, loss = 0.07827722
Iteration 2, loss = 0.00758538
Iteration 3, loss = 0.00499029
Iteration 4, loss = 0.00298741
Iteration 5, loss = 0.00153678
Iteration 6, loss = 0.00084315
Iteration 7, loss = 0.00066982
Iteration 8, loss = 0.00055463
Iteration 9, loss = 0.00045796
Iteration 10, loss = 0.00039316
Iteration 11, loss = 0.00036244
Iteration 12, loss = 0.00032432
Iteration 13, loss = 0.00029204
Iteration 14, loss = 0.00026756
Iteration 15, loss = 0.00025615
Iteration 16, loss = 0.00023330
Iteration 17, loss = 0.00022080
Iteration 18, loss = 0.00020955
Iteration 19, loss = 0.00020012
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 0.14577478
Iteration 2, loss = 0.02198209
Iteration 3, loss = 0.01092803
Iteration 4, loss = 0.00665589
Iteration 5, loss = 0.00435896
Iteration 6, loss = 0.00364047
Iteration 7, loss = 0.00261347
Iteration 8, loss = 0.00204403
Iteration 9, loss = 0.00174423
Iteration 10, loss = 0.00134607




Iteration 1, loss = 0.08138537
Iteration 2, loss = 0.01211077
Iteration 3, loss = 0.00671715
Iteration 4, loss = 0.00306671
Iteration 5, loss = 0.00159923
Iteration 6, loss = 0.00101390
Iteration 7, loss = 0.00075401
Iteration 8, loss = 0.00058231
Iteration 9, loss = 0.00047755
Iteration 10, loss = 0.00040218
Iteration 11, loss = 0.00034642
Iteration 12, loss = 0.00031839
Iteration 13, loss = 0.00029142
Iteration 14, loss = 0.00026612
Iteration 15, loss = 0.00024947
Iteration 16, loss = 0.00023623
Iteration 17, loss = 0.00022315
Iteration 18, loss = 0.00021111
Iteration 19, loss = 0.00020321
Iteration 20, loss = 0.00019471
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 0.07090174
Iteration 2, loss = 0.00137167
Iteration 3, loss = 0.00125931
Iteration 4, loss = 0.00116776
Iteration 5, loss = 0.00086901
Iteration 6, loss = 0.00068211
Iteration 7, loss = 0.00053592
Iteration 8, loss = 0.00048037
Iteration 9, loss = 0.00034284




Iteration 1, loss = 0.12673528
Iteration 2, loss = 0.02371582
Iteration 3, loss = 0.01224742
Iteration 4, loss = 0.00660736
Iteration 5, loss = 0.00366467
Iteration 6, loss = 0.00270092
Iteration 7, loss = 0.00234690
Iteration 8, loss = 0.00211080
Iteration 9, loss = 0.00142585
Iteration 10, loss = 0.00184013
Iteration 11, loss = 0.00120782
Iteration 12, loss = 0.00103173
Iteration 13, loss = 0.00088789
Iteration 14, loss = 0.00076540
Iteration 15, loss = 0.00060210
Iteration 16, loss = 0.00065061
Iteration 17, loss = 0.00074783
Iteration 18, loss = 0.00071078
Iteration 19, loss = 0.00051797
Iteration 20, loss = 0.00059128
Iteration 21, loss = 0.00047860
Iteration 22, loss = 0.00053501
Iteration 23, loss = 0.00062918
Iteration 24, loss = 0.00119690
Iteration 25, loss = 0.00074416




Iteration 1, loss = 0.07398774
Iteration 2, loss = 0.00452175
Iteration 3, loss = 0.00247169
Iteration 4, loss = 0.00149165
Iteration 5, loss = 0.00108675
Iteration 6, loss = 0.00072295
Iteration 7, loss = 0.00054639
Iteration 8, loss = 0.00046421
Iteration 9, loss = 0.00040213
Iteration 10, loss = 0.00035707
Iteration 11, loss = 0.00034132
Iteration 12, loss = 0.00027789
Iteration 13, loss = 0.00026458
Iteration 14, loss = 0.00023836
Iteration 15, loss = 0.00022575
Iteration 16, loss = 0.00020699
Iteration 17, loss = 0.00019461
Iteration 18, loss = 0.00018921
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 0.09395459
Iteration 2, loss = 0.01441759
Iteration 3, loss = 0.00873307
Iteration 4, loss = 0.00576530
Iteration 5, loss = 0.00392663
Iteration 6, loss = 0.00281241
Iteration 7, loss = 0.00248422
Iteration 8, loss = 0.00181564
Iteration 9, loss = 0.00141497
Iteration 10, loss = 0.00121613
Iteration 11, loss = 0.00117860




Iteration 1, loss = 0.07188325
Iteration 2, loss = 0.00417383
Iteration 3, loss = 0.00206470
Iteration 4, loss = 0.00064707
Iteration 5, loss = 0.00038440
Iteration 6, loss = 0.00031733
Iteration 7, loss = 0.00027570
Iteration 8, loss = 0.00024590
Iteration 9, loss = 0.00022400
Iteration 10, loss = 0.00020698
Iteration 11, loss = 0.00019325
Iteration 12, loss = 0.00018196
Iteration 13, loss = 0.00017241
Iteration 14, loss = 0.00016416
Iteration 15, loss = 0.00015700
Iteration 16, loss = 0.00015064
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 0.15805250
Iteration 2, loss = 0.03620073
Iteration 3, loss = 0.01508203
Iteration 4, loss = 0.00803685
Iteration 5, loss = 0.00487326
Iteration 6, loss = 0.00312931
Iteration 7, loss = 0.00204960
Iteration 8, loss = 0.00145861
Iteration 9, loss = 0.00113354
Iteration 10, loss = 0.00089670
Iteration 11, loss = 0.00080354
Iteration 12, loss = 0.00085010
Iteration 13, loss = 0.00115957




Iteration 1, loss = 0.07891549
Iteration 2, loss = 0.00791058
Iteration 3, loss = 0.00369762
Iteration 4, loss = 0.00245571
Iteration 5, loss = 0.00138663
Iteration 6, loss = 0.00090068
Iteration 7, loss = 0.00048160
Iteration 8, loss = 0.00038672
Iteration 9, loss = 0.00033138
Iteration 10, loss = 0.00029223
Iteration 11, loss = 0.00026543
Iteration 12, loss = 0.00024418
Iteration 13, loss = 0.00022825
Iteration 14, loss = 0.00021490
Iteration 15, loss = 0.00020382
Iteration 16, loss = 0.00019424
Iteration 17, loss = 0.00018599
Iteration 18, loss = 0.00017858
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 0.10632144
Iteration 2, loss = 0.01646394
Iteration 3, loss = 0.00688949
Iteration 4, loss = 0.00393294
Iteration 5, loss = 0.00245563
Iteration 6, loss = 0.00166158
Iteration 7, loss = 0.00129278
Iteration 8, loss = 0.00094730
Iteration 9, loss = 0.00073858
Iteration 10, loss = 0.00075123
Iteration 11, loss = 0.00054323




Iteration 1, loss = 0.07348541
Iteration 2, loss = 0.00283901
Iteration 3, loss = 0.00144248
Iteration 4, loss = 0.00055869
Iteration 5, loss = 0.00038388
Iteration 6, loss = 0.00031859
Iteration 7, loss = 0.00027628
Iteration 8, loss = 0.00024680
Iteration 9, loss = 0.00022503
Iteration 10, loss = 0.00020824
Iteration 11, loss = 0.00019443
Iteration 12, loss = 0.00018299
Iteration 13, loss = 0.00017346
Iteration 14, loss = 0.00016517
Iteration 15, loss = 0.00015797
Iteration 16, loss = 0.00015161
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 0.08111346
Iteration 2, loss = 0.00766296
Iteration 3, loss = 0.00313339
Iteration 4, loss = 0.00172780
Iteration 5, loss = 0.00098600
Iteration 6, loss = 0.00067472
Iteration 7, loss = 0.00054889
Iteration 8, loss = 0.00048093
Iteration 9, loss = 0.00041027
Iteration 10, loss = 0.00036306
Iteration 11, loss = 0.00040324
Iteration 12, loss = 0.00037522
Iteration 13, loss = 0.00033174




Iteration 1, loss = 0.09102377
Iteration 2, loss = 0.01097219
Iteration 3, loss = 0.00653266
Iteration 4, loss = 0.00460379
Iteration 5, loss = 0.00364773
Iteration 6, loss = 0.00304991
Iteration 7, loss = 0.00262802
Iteration 8, loss = 0.00210619
Iteration 9, loss = 0.00205934
Iteration 10, loss = 0.00184336
Iteration 11, loss = 0.00164142
Iteration 12, loss = 0.00140837
Iteration 13, loss = 0.00134790
Iteration 14, loss = 0.00107886
Iteration 15, loss = 0.00111726
Iteration 16, loss = 0.00090914
Iteration 17, loss = 0.00078457
Iteration 18, loss = 0.00064128
Iteration 19, loss = 0.00067166
Iteration 20, loss = 0.00073273
Iteration 21, loss = 0.00054818
Iteration 22, loss = 0.00057055
Iteration 23, loss = 0.00049630
Iteration 24, loss = 0.00051283
Iteration 25, loss = 0.00039280




Iteration 1, loss = 0.07085274
Iteration 2, loss = 0.00145801
Iteration 3, loss = 0.00128334
Iteration 4, loss = 0.00106358
Iteration 5, loss = 0.00073854
Iteration 6, loss = 0.00036859
Iteration 7, loss = 0.00028218
Iteration 8, loss = 0.00024696
Iteration 9, loss = 0.00022362
Iteration 10, loss = 0.00020628
Iteration 11, loss = 0.00019249
Iteration 12, loss = 0.00018120
Iteration 13, loss = 0.00017174
Iteration 14, loss = 0.00016356
Iteration 15, loss = 0.00015648
Iteration 16, loss = 0.00015020
Iteration 17, loss = 0.00014461
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 0.07536961
Iteration 2, loss = 0.00560738
Iteration 3, loss = 0.00341936
Iteration 4, loss = 0.00181942
Iteration 5, loss = 0.00096126
Iteration 6, loss = 0.00053302
Iteration 7, loss = 0.00039418
Iteration 8, loss = 0.00032662
Iteration 9, loss = 0.00028526
Iteration 10, loss = 0.00025604
Iteration 11, loss = 0.00023471
Iteration 12, loss = 0.00021887




Iteration 1, loss = 0.07414806
Iteration 2, loss = 0.00413896
Iteration 3, loss = 0.00266879
Iteration 4, loss = 0.00133633
Iteration 5, loss = 0.00059707
Iteration 6, loss = 0.00044180
Iteration 7, loss = 0.00034371
Iteration 8, loss = 0.00030847
Iteration 9, loss = 0.00027819
Iteration 10, loss = 0.00024110
Iteration 11, loss = 0.00022556
Iteration 12, loss = 0.00020869
Iteration 13, loss = 0.00019526
Iteration 14, loss = 0.00018451
Iteration 15, loss = 0.00017532
Iteration 16, loss = 0.00016834
Iteration 17, loss = 0.00016120
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 0.12103519
Iteration 2, loss = 0.03625574
Iteration 3, loss = 0.01273591
Iteration 4, loss = 0.00603161
Iteration 5, loss = 0.00348386
Iteration 6, loss = 0.00231529
Iteration 7, loss = 0.00170308
Iteration 8, loss = 0.00137648
Iteration 9, loss = 0.00114382
Iteration 10, loss = 0.00083520
Iteration 11, loss = 0.00073493
Iteration 12, loss = 0.00063909




Iteration 1, loss = 0.08158331
Iteration 2, loss = 0.01264966
Iteration 3, loss = 0.00761395
Iteration 4, loss = 0.00384425
Iteration 5, loss = 0.00168470
Iteration 6, loss = 0.00098870
Iteration 7, loss = 0.00072080
Iteration 8, loss = 0.00056948
Iteration 9, loss = 0.00046797
Iteration 10, loss = 0.00040817
Iteration 11, loss = 0.00035866
Iteration 12, loss = 0.00033060
Iteration 13, loss = 0.00029657
Iteration 14, loss = 0.00027883
Iteration 15, loss = 0.00026105
Iteration 16, loss = 0.00024703
Iteration 17, loss = 0.00022890
Iteration 18, loss = 0.00022152
Iteration 19, loss = 0.00021140
Iteration 20, loss = 0.00020110
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 0.07273031
Iteration 2, loss = 0.00339290
Iteration 3, loss = 0.00271003
Iteration 4, loss = 0.00179358
Iteration 5, loss = 0.00110392
Iteration 6, loss = 0.00066577
Iteration 7, loss = 0.00046275
Iteration 8, loss = 0.00037353
Iteration 9, loss = 0.00031279


In [13]:
# Predict all target columns for the test set with the fitted multi-output classifier.
# The next cell labels the columns of y_pred with y_train.columns, i.e. it assumes the
# prediction columns come back in the same order as the training targets.
y_pred = classifier.predict(X_test_idf_final)

In [14]:
# Output settings: label names taken from the training targets, and the destination CSV
column_names = y_train.columns
file_path = "day3.csv"

# Wrap the raw predictions in a DataFrame labelled with the target column names
pd_pred = pd.DataFrame(data=y_pred, columns=column_names)

# Write the predictions to CSV with a header row and without the row index
pd_pred.to_csv(file_path, header=True, index=False)