In [1]:
# Import the necessary libraries
import re
import csv
import nltk
import string
import pandas as pd
from nltk.corpus import stopwords
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Read the training and test sets from their CSV files in the working directory
df, test = (pd.read_csv(csv_path) for csv_path in ('train_set.csv', 'new_test.csv'))

In [3]:
# Bind 'train' to the loaded training frame.
# NOTE: this is an alias, not a copy -- 'train' and 'df' refer to the same
# object, so every edit made to 'train' below is also visible through 'df'.
train = df


def strip_digits(text):
    """Return `text` with every run of consecutive digits removed."""
    return re.sub(r'\d+', '', text)


# Drop all digits from the 'Text' column of the training data
train['Text'] = train['Text'].map(strip_digits)

# Drop all digits from the 'Text' column of the test data
test['Text'] = test['Text'].map(strip_digits)

In [4]:
# Translation table that deletes every character in string.punctuation.
# Built once here instead of being rebuilt for every row inside the lambda.
punct_table = str.maketrans('', '', string.punctuation)

# Remove punctuation and convert text to lowercase in the 'Text' column of 'train'.
# The input is read from 'train' (not 'df') so this step always builds on the
# text produced by the previous cleaning step, regardless of whether 'train'
# shares its data with 'df'.
train['Text'] = train['Text'].apply(lambda x: x.translate(punct_table).lower())

# Remove punctuation and convert text to lowercase in the 'Text' column of 'test'
test['Text'] = test['Text'].apply(lambda x: x.translate(punct_table).lower())

In [5]:
# Fetch the NLTK stopword corpus and load the English stopword list
nltk.download('stopwords')
stop = stopwords.words('english')

# Set view of the stopword list: membership tests against a list scan every
# entry for every token, while a set lookup is constant time. 'stop' itself is
# left as a list so its type is unchanged.
stop_set = set(stop)

# NOTE(review): punctuation was already stripped from 'Text' in the previous
# step, so any stopword that contains an apostrophe can no longer match its
# stripped form in the text -- confirm whether that matters for this model.

# Remove stopwords from the 'Text' column in 'train'
train['Text'] = train['Text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_set]))

# Remove stopwords from the 'Text' column in 'test'
test['Text'] = test['Text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_set]))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marco\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Build a TF-IDF vectorizer that uses both single words and two-word sequences
# (ngram_range=(1, 2)) as features
tfidf = TfidfVectorizer(ngram_range=(1, 2))

# Fit the vectorizer on the training text and turn that text into a TF-IDF
# matrix in a single call
train_docs = train['Text']
train_tfidf = tfidf.fit_transform(train_docs)

# Encode the test text with the already-fitted vectorizer (no refitting)
test_docs = test['Text']
test_tfidf = tfidf.transform(test_docs)

In [7]:
# Configure the neural-network classifier: hidden layer sizes (100, 50),
# at most 10 training iterations, a fixed random seed, and per-iteration
# loss printed to the cell output
nn = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    max_iter=10,
    random_state=42,
    verbose=True,
)

# Train the classifier on the TF-IDF features of the training set, using the
# 'Directory code' column as the target to learn
train_labels = train['Directory code']
nn.fit(train_tfidf, train_labels)

Iteration 1, loss = 0.51449038
Iteration 2, loss = 0.02006540
Iteration 3, loss = 0.01106236
Iteration 4, loss = 0.00771310
Iteration 5, loss = 0.00616946
Iteration 6, loss = 0.00559560
Iteration 7, loss = 0.00552733
Iteration 8, loss = 0.00412983
Iteration 9, loss = 0.00448420
Iteration 10, loss = 0.00553465




In [8]:
# Predict a label for every row of the TF-IDF-encoded test set with the fitted network
y_pred = nn.predict(X=test_tfidf)

In [9]:
# Save the predictions to 'predictions.csv' as a single column whose header is 'label'.
# newline='' leaves line-ending handling to the csv module; the explicit UTF-8
# encoding keeps the output independent of the platform's default encoding.
with open('predictions.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Write the header row with the column name 'label'
    writer.writerow(["label"])

    # Write one row per prediction, in the order they appear in 'y_pred'
    writer.writerows([label] for label in y_pred)