In [1]:
# Import the necessary libraries
import re
import nltk 
import string
import pandas as pd
from scipy.sparse import hstack
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Read the training and test splits from their CSV files into dataframes
TRAIN_CSV = 'dataset_challenge_DAY2/train_set.csv'
TEST_CSV = 'dataset_challenge_DAY2/new_test.csv'

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

In [3]:
# Target labels for training: the 'Directory code' column of the train dataframe
TARGET_COLUMN = 'Directory code'
y_train = train[TARGET_COLUMN]

In [4]:
# Define a text cleaning function that keeps a fixed character window of the input text
def take_some_text(text, start=149, end=8000):
    """Return the slice ``text[start:end]`` of the input text.

    Parameters
    ----------
    text : str
        Document to truncate.
    start : int, optional
        Index of the first character kept. Defaults to 149, the value this
        notebook originally hard-coded.
    end : int, optional
        Index one past the last character kept. Defaults to 8000, the value
        this notebook originally hard-coded.

    Returns
    -------
    str
        The characters at positions ``start`` .. ``end - 1``. Standard slicing
        rules apply, so a text shorter than ``start`` yields an empty string.
    """
    return text[start:end]

In [5]:
# Define a function to remove stopwords from a text column in a dataframe
def remove_stopwords_and_punkt(df, text_column="text", legal_stopwords=False):
    """Lowercase, tokenize and drop stopwords from ``df[text_column]``, in place.

    Despite its name, this function does not strip punctuation: 'punkt' is the
    NLTK tokenizer model that is downloaded so ``word_tokenize`` can run.
    Tokens are only dropped when they appear in the stopword set.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe holding the text; its ``text_column`` is overwritten.
    text_column : str, optional
        Name of the column to clean (default ``"text"``).
    legal_stopwords : iterable of str, optional
        Extra words to drop in addition to NLTK's English stopwords. Any falsy
        value (``False``, ``None``, empty collection) means "no extra words".
        A single bare string is treated as one word.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in (not a copy), with
        ``text_column`` replaced by the space-joined remaining tokens.
    """
    nltk.download('stopwords')  # Download the NLTK English stopword lists
    nltk.download('punkt')      # Download the NLTK 'punkt' tokenizer model

    stop_words = set(stopwords.words('english'))  # Create a set of English stopwords

    if legal_stopwords:
        # A bare string would otherwise be unioned character by character
        if isinstance(legal_stopwords, str):
            legal_stopwords = [legal_stopwords]
        stop_words = stop_words.union(legal_stopwords)

    def remove_stop_words(text):
        words = word_tokenize(text.lower())  # Lowercase first, then tokenize
        clean_words = [word for word in words if word not in stop_words]  # Drop stopwords

        return " ".join(clean_words)  # Join the remaining tokens back into a string

    # Overwrites the column on the caller's dataframe; later cells rely on this
    df[text_column] = df[text_column].apply(remove_stop_words)

    return df

In [6]:
# Define a function to remove punctuation and convert text to lowercase in a dataframe column
def remove_punct(df, text_column='Text'):
    """Strip ``string.punctuation`` characters and lowercase a text column, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe holding the text; its ``text_column`` is overwritten.
    text_column : str, optional
        Name of the column to clean (default ``'Text'``).

    Returns
    -------
    pandas.Series
        The cleaned column, i.e. ``df[text_column]`` after the update.
    """
    # Build the translation table once rather than once per row
    punct_table = str.maketrans('', '', string.punctuation)

    df[text_column] = df[text_column].apply(lambda x: x.translate(punct_table).lower())

    return df[text_column]  # Return the cleaned column

In [7]:
# Define a function to remove digits from text in a dataframe column
def remove_digit(df, text_column='Text'):
    """Delete every run of digits from a text column, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe holding the text; its ``text_column`` is overwritten.
    text_column : str, optional
        Name of the column to clean (default ``'Text'``).

    Returns
    -------
    pandas.Series
        The cleaned column, i.e. ``df[text_column]`` after the update.
    """
    # Compile once; the same pattern is applied to every row
    digit_runs = re.compile(r'\d+')

    df[text_column] = df[text_column].apply(lambda x: digit_runs.sub('', x))

    return df[text_column]  # Return the column with digits removed

In [8]:
# Define a function to remove specified symbols from text in a dataframe column
def remove_symbols(df, text_column='Text'):
    """Delete the characters ``/ $ @ \\ .`` from a text column, in place.

    The symbols are matched as a regex character class built from literal,
    escaped characters. An earlier version listed the regex fragment ``'\\.+'``
    among the symbols and then ran it through ``re.escape``, which made it
    match only the literal three-character sequence backslash-dot-plus, so
    dots were never removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe holding the text; its ``text_column`` is overwritten.
    text_column : str, optional
        Name of the column to clean (default ``'Text'``).

    Returns
    -------
    pandas.Series
        The cleaned column, i.e. ``df[text_column]`` after the update.
    """
    symbols_to_remove = ['/', '$', '@', '\\', '.']  # Literal characters, not regex fragments
    # One character class matching any run of the listed symbols
    pattern = re.compile('[{}]+'.format(''.join(re.escape(symbol) for symbol in symbols_to_remove)))

    df[text_column] = df[text_column].apply(lambda x: pattern.sub('', x))

    return df[text_column]  # Return the column with the symbols removed

In [9]:
# Clean the 'Text' column of the train set: stopwords first, then truncation, then character-level clean-ups
X_train = remove_stopwords_and_punkt(train, text_column="Text")  # Lowercase, tokenize, drop stopwords (returns `train` itself)
X_train['Text'] = X_train['Text'].apply(take_some_text)  # Keep only the fixed character window

# Remaining steps each rewrite 'Text' and hand the column back: digits -> symbols -> punctuation
for clean_step in (remove_digit, remove_symbols, remove_punct):
    X_train['Text'] = clean_step(X_train)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/simonemaiorani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/simonemaiorani/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [10]:
# Clean the 'Text' column of the test set with the same steps, in the same order, as the train set
test = remove_stopwords_and_punkt(test, text_column="Text")  # Lowercase, tokenize, drop stopwords
test['Text'] = test['Text'].apply(take_some_text)  # Keep only the fixed character window

# Remaining steps each rewrite 'Text' and hand the column back: digits -> symbols -> punctuation
for clean_step in (remove_digit, remove_symbols, remove_punct):
    test['Text'] = clean_step(test)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/simonemaiorani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/simonemaiorani/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
# --- Citation features ---
# TF-IDF vocabulary is fitted on the train citations only, then reused for test
cit_train = train["Citations"]
cit_test = test["Citations"]
tfidf_vectorizer_citazioni = TfidfVectorizer()
cit_tf_train = tfidf_vectorizer_citazioni.fit_transform(cit_train)
cit_tf_test = tfidf_vectorizer_citazioni.transform(cit_test)

# --- Text features ---
# Separate vectorizer, again fitted on train and only applied to test
vectorizer = TfidfVectorizer()
X_train_idf = vectorizer.fit_transform(X_train['Text'])
X_test_idf = vectorizer.transform(test['Text'])

# Print the shape of the TF-IDF matrix for the train text
print(X_train_idf.shape)

# --- Combined features ---
# Text columns first, citation columns second, stacked side by side for both splits
X_train_idf_final = hstack([X_train_idf, cit_tf_train])
X_test_idf_final = hstack([X_test_idf, cit_tf_test])

(9941, 80310)


In [12]:
# Configure the MLP classifier: two hidden layers of 150 units, Adam solver, fixed seed
# NOTE(review): activation='identity' applies no non-linearity in the hidden layers - confirm this is intended
# NOTE(review): scikit-learn documents learning_rate as only used when solver='sgd' - confirm it has any effect here
mlp = MLPClassifier(
    hidden_layer_sizes=(150, 150),
    activation='identity',
    solver='adam',
    learning_rate='adaptive',
    max_iter=150,
    random_state=42,
    verbose=True,  # Print the loss after every iteration
)

# Fit on the combined text + citation TF-IDF matrix against the 'y_train' labels
mlp.fit(X_train_idf_final, y_train)

Iteration 1, loss = 2.56357047
Iteration 2, loss = 0.88582948
Iteration 3, loss = 0.45714378
Iteration 4, loss = 0.27514736
Iteration 5, loss = 0.17607049
Iteration 6, loss = 0.11367899
Iteration 7, loss = 0.07532807
Iteration 8, loss = 0.05174722
Iteration 9, loss = 0.03807115
Iteration 10, loss = 0.02897765
Iteration 11, loss = 0.02231242
Iteration 12, loss = 0.01808678
Iteration 13, loss = 0.01447403
Iteration 14, loss = 0.01286682
Iteration 15, loss = 0.01124758
Iteration 16, loss = 0.00942494
Iteration 17, loss = 0.00888397
Iteration 18, loss = 0.00770508
Iteration 19, loss = 0.00731101
Iteration 20, loss = 0.00657001
Iteration 21, loss = 0.00692997
Iteration 22, loss = 0.00628749
Iteration 23, loss = 0.00563029
Iteration 24, loss = 0.00522301
Iteration 25, loss = 0.00487423
Iteration 26, loss = 0.00518856
Iteration 27, loss = 0.00533468
Iteration 28, loss = 0.00491238
Iteration 29, loss = 0.00466680
Iteration 30, loss = 0.00429896
Iteration 31, loss = 0.00414201
Iteration 32, los

In [13]:
# Predict a label for every row of the combined test feature matrix with the fitted MLP
y_pred = mlp.predict(X=X_test_idf_final)

In [14]:
# Write the predictions to 'predictions_NN.csv': a 'label' header, then one prediction per row
import csv

with open('predictions_NN.csv', 'w', newline='') as out_file:
    csv_writer = csv.writer(out_file)

    # Header row with the single column name 'label'
    csv_writer.writerow(["label"])

    # One single-cell row per entry of 'y_pred', in order
    csv_writer.writerows([label] for label in y_pred)