# Data preprocessing

In [4]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from tqdm import tqdm

def load_data(gt_path, data_path):
 with open(data_path) as f:
  data = f.read().splitlines()

 with open(gt_path, "r") as f:
  labels = f.read().splitlines()

 df = pd.DataFrame({"text": data, "label": labels})
 df = df[df["text"] != ";;;"]
 df["text"] = df["text"].apply(lambda x: x.replace(";;;", ""))
 df = df[~(df["label"].str.strip()=="")]
 df = df[~df["label"].str.contains(";")]

 df["label"] = df["label"].str.strip()

 df["label"] = np.where(df["label"] == "O O", "O", df["label"])
 return df

train = load_data("train_gt.csv", "train_data.csv")
valid = load_data("valid_gt.csv", "valid_data.csv")
label_mapping = {'O': 0, 'B-ORG': 1, 'B-MISC': 2, 'B-PER': 3, 'I-PER': 4, 'B-LOC': 5, 'I-ORG': 6, 'I-MISC': 7, 'I-LOC': 8}
train['label'] = train['label'].map(label_mapping)
valid['label'] = valid['label'].map(label_mapping)
ner_pos = preprocessing.LabelEncoder()
def rows_to_sentences_and_labels(df):
    sentences = []
    sentences_labels = []
    current_sentence = []
    current_labels = []

    for index, row in tqdm(df.iterrows(), total = len(df)):
        word, label = row['text'], row['label']
        current_sentence.append(word.strip())
        current_labels.append(label)
        if word.strip() == '.':
            sentences.append(current_sentence)
            sentences_labels.append(current_labels)
            current_sentence = []
            current_labels = []

    return sentences, sentences_labels

train_sentences, train_sentences_labels = rows_to_sentences_and_labels(train)
valid_sentences, valid_sentences_labels = rows_to_sentences_and_labels(valid)

train_sentences_str = [item for sublist in train_sentences for item in sublist]
valid_sentences_str = [item for sublist in valid_sentences for item in sublist]
train_sentences_labels_str = [item for sublist in train_sentences_labels for item in sublist]
valid_sentences_labels_str = [item for sublist in valid_sentences for item in sublist]


100%|██████████| 202386/202386 [00:17<00:00, 11883.49it/s]
100%|██████████| 50937/50937 [00:02<00:00, 20764.11it/s]


---

# Training

 MultinomialNB model without tokenize

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression


# Split the data into training, development, and test sets
X_train, X_dev, y_train, y_dev = train_test_split(train_sentences_str, train_sentences_labels_str, test_size=0.20)
X_dev, X_test, y_dev, y_test = train_test_split(X_dev, y_dev, test_size=0.50)

print(len(X_train), len(X_dev), len(X_test))
print(X_train[:10])
# Vectorize the data using TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_features = vectorizer.fit_transform(X_train)
X_dev_features = vectorizer.transform(X_dev)
X_test_features = vectorizer.transform(X_test)

# Encode labels using LabelEncoder
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform([str(label) for label in y_train])
y_dev_encoded = label_encoder.transform([str(label) for label in y_dev])
y_test_encoded = label_encoder.transform([str(label) for label in y_test])

# Train Multinomial Naive Bayes model
multinomial_naive_bayes = MultinomialNB(alpha=0.001)
multinomial_naive_bayes.fit(X_train_features, y_train_encoded)

# Evaluate on the development set
dev_predictions = multinomial_naive_bayes.predict(X_dev_features)
accuracy_dev = (dev_predictions == y_dev_encoded).mean()
print(f"Development Set Accuracy: {accuracy_dev}")

# Evaluate on the test set
test_predictions = multinomial_naive_bayes.predict(X_test_features)
accuracy_test = (test_predictions == y_test_encoded).mean()
print(f"Test Set Accuracy: {accuracy_test}")

y_dev_predictions = multinomial_naive_bayes.predict(X_dev_features)

print(classification_report(y_pred=y_dev_predictions, y_true=y_dev, digits=4, zero_division=1))

161891 20236 20237
['Party', 'negotiations', 'on', '.', 'for', 'Previous', 'approves', 'on', 'runs', 'the']
Development Set Accuracy: 0.9348685510970548
Test Set Accuracy: 0.9358106438701389
              precision    recall  f1-score   support

           0     0.9558    0.9955    0.9752     16822
           1     0.8477    0.7021    0.7681       658
           2     0.8552    0.7066    0.7738       351
           3     0.7778    0.6865    0.7293       673
           4     0.6960    0.4460    0.5436       426
           5     0.8406    0.7753    0.8066       721
           6     0.7406    0.4220    0.5377       372
           7     0.5806    0.3186    0.4114       113
           8     0.7733    0.5800    0.6629       100

    accuracy                         0.9349     20236
   macro avg     0.7853    0.6258    0.6898     20236
weighted avg     0.9281    0.9349    0.9290     20236



GridSearchCV model (has not finish)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

# Define the pipeline with TF-IDF vectorizer and Logistic Regression classifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
])

# Define the parameters for grid search
parameters = {
    'tfidf__max_features': [1000, 5000, 10000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],  # unigrams or bigrams
}

# Perform grid search with 5-fold cross-validation
grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Print the best parameters found by grid search
print("Best parameters:")
print(grid_search.best_params_)

# Evaluate on the development set
dev_predictions = grid_search.predict(X_dev)
accuracy_dev = (dev_predictions == y_dev).mean()
print(f"Development Set Accuracy: {accuracy_dev}")

# Evaluate on the test set
test_predictions = grid_search.predict(X_test)
accuracy_test = (test_predictions == y_test).mean()
print(f"Test Set Accuracy: {accuracy_test}")

# Print classification report
print(classification_report(y_pred=test_predictions, y_true=y_test, digits=4, zero_division=1))

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best parameters:
{'clf__C': 10, 'tfidf__max_features': 10000, 'tfidf__ngram_range': (1, 1)}
Development Set Accuracy: 0.9240956710812414
Test Set Accuracy: 0.9255324405791372
              precision    recall  f1-score   support

           0     0.9430    0.9946    0.9681     16918
           1     0.8248    0.6176    0.7063       625
           2     0.8459    0.7108    0.7725       332
           3     0.8051    0.6230    0.7024       610
           4     0.6947    0.3420    0.4584       459
           5     0.8558    0.7479    0.7982       722
           6     0.6667    0.3427    0.4527       356
           7     0.5556    0.3431    0.4242       102
           8     0.6761    0.4248    0.5217       113

    accuracy                         0.9255     20237
   macro avg     0.7631    0.5718    0.6450     20237
weighted avg     0.9165    0.9255    0.9168     20237



MultinomialNB with tokenize

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


def tokenize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    tagged_tokens = pos_tag(tokens)
    tokens = [lemmatizer.lemmatize(token.lower(), get_wordnet_pos(tag))
              for token, tag in tagged_tokens if token not in string.punctuation and token.lower() not in stopwords.words('english')]
    return tokens

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return 'a'  # adjective
    elif tag.startswith('V'):
        return 'v'  # verb
    elif tag.startswith('N'):
        return 'n'  # noun
    elif tag.startswith('R'):
        return 'r'  # adverb
    else:
        return 'n'  # default to noun if POS tag is unknown



# Split the data into training, development, and test sets
X_train, X_dev, y_train, y_dev = train_test_split(train_sentences_str, train_sentences_labels_str, test_size=0.20)
X_dev, X_test, y_dev, y_test = train_test_split(X_dev, y_dev, test_size=0.50)

print(len(X_train), len(X_dev), len(X_test))
print(X_train[:10])
# Vectorize the data using TfidfVectorizer
vectorizer = TfidfVectorizer(tokenizer=tokenize, ngram_range=(1, 2))

X_train_features = vectorizer.fit_transform(X_train)
X_dev_features = vectorizer.transform(X_dev)
X_test_features = vectorizer.transform(X_test)

# Encode labels using LabelEncoder
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform([str(label) for label in y_train])
y_dev_encoded = label_encoder.transform([str(label) for label in y_dev])
y_test_encoded = label_encoder.transform([str(label) for label in y_test])

# Train Multinomial Naive Bayes model
multinomial_naive_bayes = MultinomialNB(alpha=0.001)
multinomial_naive_bayes.fit(X_train_features, y_train_encoded)

# Evaluate on the development set
dev_predictions = multinomial_naive_bayes.predict(X_dev_features)
accuracy_dev = (dev_predictions == y_dev_encoded).mean()
print(f"Development Set Accuracy: {accuracy_dev}")

# Evaluate on the test set
test_predictions = multinomial_naive_bayes.predict(X_test_features)
accuracy_test = (test_predictions == y_test_encoded).mean()
print(f"Test Set Accuracy: {accuracy_test}")

y_dev_predictions = multinomial_naive_bayes.predict(X_dev_features)

print(classification_report(y_pred=y_dev_predictions, y_true=y_dev, digits=4, zero_division=1))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


161891 20236 20237
['28', 'North', '95', '936.000', 'the', 'shut', 'Tuesday', 'conditions', 'squad', 'political']




Development Set Accuracy: 0.9358568887131844
Test Set Accuracy: 0.9310174432969314
              precision    recall  f1-score   support

           0     0.9518    0.9975    0.9741     16787
           1     0.8568    0.6748    0.7550       612
           2     0.8758    0.7112    0.7850       367
           3     0.8049    0.6960    0.7465       658
           4     0.7105    0.4789    0.5722       451
           5     0.8866    0.8177    0.8508       746
           6     0.7977    0.3740    0.5092       369
           7     0.6957    0.2500    0.3678       128
           8     0.7831    0.5508    0.6468       118

    accuracy                         0.9359     20236
   macro avg     0.8181    0.6168    0.6897     20236
weighted avg     0.9296    0.9359    0.9289     20236



---
# Prediction

In [8]:
# import numpy as np
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

def save_file(data, predict, file_name):

    # Define the label_mapping_reverse dictionary
    label_mapping_reverse = {0: 'O', 1: 'B-ORG', 2: 'B-MISC', 3: 'B-PER', 4: 'I-PER', 5: 'B-LOC', 6: 'I-ORG', 7: 'I-MISC', 8: 'I-LOC'}

    # Map elements of nb_predicted_labels using label_mapping_reverse
    mapped_labels = [label_mapping_reverse[int(label)] for label in predict]

    # Combine the text and mapped_labels into a list of strings
    combined_data = []
    for text, label in zip(data["text"], mapped_labels):
        # Check if the text is not empty or just a space
        if text.strip():
            combined_data.append(f"{text.strip()} {label}\n")
        else:
            # If the text is empty or just a space, exclude the label
            combined_data.append(f"  {label}\n")

    # Write the combined data to a CSV file
    with open(file_name, "w") as f:
        f.writelines(combined_data)
# Load and preprocess the new data
def preprocess_new_data(data):
    # Create a DataFrame with the new data
    df = pd.DataFrame({"text": data})
    # Preprocess the text (if needed)
    df["text"] = df["text"].apply(lambda x: x.strip())
    return df

# Load the new data
def load_new_data(data_path):
    with open(data_path, "r") as file:
        data = file.readlines()
    return data

# Load and preprocess the new data
new_data = load_new_data("test_data.csv")
new_data_df = preprocess_new_data(new_data)

# Vectorize the new data using the fitted vectorizer
X_new_features = vectorizer.transform(new_data_df["text"])

# Predict using Multinomial Naive Bayes
nb_predictions = multinomial_naive_bayes.predict(X_new_features)
print(nb_predictions)
nb_predicted_labels = label_encoder.inverse_transform(nb_predictions)

print("Predictions using Multinomial Naive Bayes:")
print(nb_predicted_labels)

# Save predictions to a file
save_file(new_data_df, nb_predicted_labels, 'predict_nb.csv')

[0 0 0 ... 0 0 0]
Predictions using Multinomial Naive Bayes:
['0' '0' '0' ... '0' '0' '0']
