In [None]:
!pip install transformers



In [None]:
!pip install huggingface_hub



In [None]:
!pip install fasttext



In [None]:
import tensorflow as tf

# Ask TensorFlow for the GPU device name.
device_name = tf.test.gpu_device_name()

# Stop right away unless the reported name is exactly '/device:GPU:0';
# otherwise confirm which device was found.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
import torch

# Choose the PyTorch device once: CUDA when it is usable, the CPU otherwise.
has_cuda = torch.cuda.is_available()
device = torch.device("cuda" if has_cuda else "cpu")

if not has_cuda:
    # CPU fallback.
    print('No GPU available, using the CPU instead.')
else:
    # Report how many GPUs are visible and the name of GPU 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: NVIDIA A100-SXM4-40GB


In [None]:
import glob
import pandas as pd
import csv
import re
import math
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from transformers import AutoTokenizer, AutoModel, AdamW
import torch

import fasttext
from huggingface_hub import hf_hub_download
import matplotlib.pyplot as plt

from sklearn.utils import shuffle
import matplotlib.pyplot as plt

import pandas as pd
import json

In [None]:
def transform_column(df_column):
  """Min-max scale a column of scores.

  The values are copied into an (n, 1) NumPy column vector, scaled with a
  freshly fitted MinMaxScaler (default settings), and returned as a float32
  array of the same (n, 1) shape.
  """
  column_vector = np.array(df_column).reshape(-1, 1)
  rescaled = MinMaxScaler().fit_transform(column_vector)
  return rescaled.astype(np.float32)

In [None]:
# Download the fastText language-identification weights from the Hugging Face
# Hub, then load them into a fastText model.
model_path = hf_hub_download(
    repo_id="facebook/fasttext-language-identification",
    filename="model.bin",
)
model = fasttext.load_model(model_path)

def is_romanian_post(row):
  """Tell whether the language-ID model labels the post's title as Romanian.

  `row` must support `row["title"]`. Newlines in the title are replaced by
  spaces before prediction, and the first label returned by `model.predict`
  is compared against '__label__ron_Latn'.
  """
  single_line_title = change_n_to_space(row["title"])
  predicted_labels = model.predict(single_line_title)[0]
  return predicted_labels[0] == '__label__ron_Latn'

def change_n_to_space(text):
  """Return `text` with every newline character replaced by one space."""
  # Splitting on '\n' and re-joining with ' ' swaps each newline one-for-one.
  return ' '.join(text.split('\n'))



In [None]:
# Load the three pre-split CSV files (paths are relative to the working directory).
df_train, df_validation, df_test = map(
    pd.read_csv, ("train.csv", "validation.csv", "test.csv")
)

In [None]:
# Pull the `full_text` and `label` columns out of each split as arrays.
### TRAIN ###
contents_train, labels_train = df_train["full_text"].values, df_train["label"].values

### VALIDATION ###
contents_validation, labels_validation = (
    df_validation["full_text"].values,
    df_validation["label"].values,
)

### TEST ###
contents_test, labels_test = df_test["full_text"].values, df_test["label"].values

In [None]:
def clean_text (text):
  """Normalize a post for bag-of-words features.

  Steps, in order:
    1. Fold accented letters to their base letter (a-breve -> 'a',
       s-comma -> 's', ...), so words written with diacritics stay whole
       instead of being broken apart by the ASCII filter below.
    2. Lowercase.
    3. Replace every character that is not an ASCII letter, digit or
       whitespace with a space.
    4. Collapse each whitespace run into a single space.

  Args:
    text: the raw text (str).

  Returns:
    The cleaned string. Leading/trailing single spaces are kept.
  """
  import unicodedata  # stdlib; imported locally so this cell stays self-contained

  # Canonical decomposition splits an accented letter into base letter +
  # combining mark(s); dropping the marks leaves only the base letter.
  decomposed = unicodedata.normalize('NFD', text)
  text = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

  text = text.lower()
  # After lowercasing, only a-z, digits and whitespace are worth keeping.
  text = re.sub(r'[^a-z0-9\s]', ' ', text)
  text = re.sub(r'\s+', ' ', text)
  return text

# Run every text of every split through clean_text, keeping plain lists.
contents_train = list(map(clean_text, contents_train))
contents_validation = list(map(clean_text, contents_validation))
contents_test = list(map(clean_text, contents_test))


In [None]:
# Tokenize the cleaned texts with NLTK (no fastText embeddings are computed in this cell)
! pip install pandas numpy scikit-learn gensim nltk

# Tokenization using nltk

import nltk

nltk.download('punkt')
from gensim.models import FastText
from nltk.tokenize import word_tokenize

def _preview_and_tokenize(split_name, contents):
  """Print the split name and its first text, tokenize every text with
  word_tokenize, print the first token list, and return all token lists."""
  print(split_name)
  # Show the first (already cleaned) text of the split.
  print(' Original: ', contents[0])
  token_lists = [word_tokenize(text) for text in contents]
  print (token_lists[0])
  return token_lists


contents_train_tokenized = _preview_and_tokenize("TRAIN", contents_train)

contents_validation_tokenized = _preview_and_tokenize("VALIDATION", contents_validation)

contents_test_tokenized = _preview_and_tokenize("TEST", contents_test)






[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


TRAIN
 Original:  dosarul gazelor ieftine adriean videanu a fost arestat preventiv 
['dosarul', 'gazelor', 'ieftine', 'adriean', 'videanu', 'a', 'fost', 'arestat', 'preventiv']
VALIDATION
 Original:  se mai gasesc cinemauri clasice in propia cladire nu incluse intr un mall 
['se', 'mai', 'gasesc', 'cinemauri', 'clasice', 'in', 'propia', 'cladire', 'nu', 'incluse', 'intr', 'un', 'mall']
TEST
 Original:  salut reddit a a cum spune i username ul i m looking for guidance sunt o persoan nev z toare care se chinuie s i g seasc job de mai bine de doi ani i nimic salut n primul r nd mi cer scuze dar poate ave i voi idei cum a putea s mi g sesc un job c eu nu tiu ce s mai fac i unde s aplic i mi cam pierd speran a n mod normal nu mi place s spun c sunt nev z toare automat oamenii spun f masaj apreciez persoanele care fac a a ceva dar nu e pentru mine a a cum it nu e pentru toat lumea problema e urm toarea aplic peste tot ejobs olx facebook linkedin etc lucrurile decurg bine p n c nd ntreb dac a

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

def compute_metrics(y_pred, y_true):
  """Compute macro F1 plus per-class precision and recall.

  Label 1 is the "popular" class; any true label other than 1 is counted as
  "unpopular". Predictions that are neither 0 nor 1 are ignored by the
  per-class counts.

  Args:
    y_pred: predicted labels; must provide `.flatten()`.
    y_true: true labels, aligned with the flattened predictions.

  Returns:
    Tuple (macro_f1, P_pop, R_pop, P_unpop, R_unpop). A precision or recall
    whose denominator is zero is reported as 0.
  """
  y_pred = y_pred.flatten()
  macro_f1 = f1_score(y_true, y_pred, average='macro')

  # Confusion counts, named <true class>_as_<predicted class>.
  pop_as_pop = 0      # y_true = 1 and y_pred = 1
  pop_as_unpop = 0    # y_true = 1 and y_pred = 0
  unpop_as_unpop = 0  # y_true = 0 and y_pred = 0
  unpop_as_pop = 0    # y_true = 0 and y_pred = 1
  for actual, predicted in zip(y_true, y_pred):
    if actual == 1:
      # Popular
      if predicted == 1:
        pop_as_pop += 1
      elif predicted == 0:
        pop_as_unpop += 1
    else:
      # Unpopular
      if predicted == 0:
        unpop_as_unpop += 1
      elif predicted == 1:
        unpop_as_pop += 1

  def _ratio(hits, misses):
    # hits / (hits + misses), or 0 when there is nothing to divide by.
    total = hits + misses
    return 0 if total == 0 else hits / total

  # A false positive of one class is a false negative of the other.
  P_pop = _ratio(pop_as_pop, unpop_as_pop)
  R_pop = _ratio(pop_as_pop, pop_as_unpop)
  P_unpop = _ratio(unpop_as_unpop, pop_as_unpop)
  R_unpop = _ratio(unpop_as_unpop, unpop_as_pop)

  return (macro_f1, P_pop, R_pop, P_unpop, R_unpop)

In [None]:
# Random forest classification

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Use TF-IDF to convert text data to numerical features

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF settings: drop terms seen in fewer than 5 documents or in more than
# 80% of them, and use sublinear term-frequency scaling.
vectorizer = TfidfVectorizer(
    min_df=5,
    max_df=0.8,
    sublinear_tf=True,
    use_idf=True,
)

# Glue each token list back into one space-separated string per document.
train_docs = [' '.join(tokens) for tokens in contents_train_tokenized]
validation_docs = [' '.join(tokens) for tokens in contents_validation_tokenized]
test_docs = [' '.join(tokens) for tokens in contents_test_tokenized]

# Fit the vectorizer on the training split only, then reuse it for the others.
# NOTE: the *_tokenized names are rebound to the TF-IDF matrices here, so this
# cell cannot be re-run without re-running the tokenization cell first.
contents_train_tokenized = vectorizer.fit_transform(train_docs)
contents_validation_tokenized = vectorizer.transform(validation_docs)
contents_test_tokenized = vectorizer.transform(test_docs)

In [None]:
# Random forest with 100 trees and a fixed seed.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

# Fit on the TF-IDF features and labels of the training split.
rf.fit(X=contents_train_tokenized, y=labels_train)

In [None]:
# Use the forest's predict method on the validation data
predictions_validation = rf.predict(contents_validation_tokenized)

# Mean absolute difference between predicted and true labels. Assuming 0/1
# labels (as compute_metrics does), this is the fraction of misclassified
# posts, i.e. 1 - accuracy. It is a unitless rate, so no unit is printed.
errors = np.abs(predictions_validation - labels_validation)
print('Mean Absolute Error:', round(np.mean(errors), 2))

# Evaluate using compute_metrics function
macro_f1, P_popular, R_popular, P_unpopular, R_unpopular = compute_metrics(predictions_validation, labels_validation)
accuracy_validation = accuracy_score(labels_validation, predictions_validation)
print("VALIDATION")
print("Accuracy: ", accuracy_validation)
print("Macro F1: ", macro_f1)
print("Popular class: ")
print("Precision: ", P_popular)
print("Recall: ", R_popular)
print("Unpopular class: ")
print("Precision: ", P_unpopular)
print("Recall: ", R_unpopular)

Mean Absolute Error: 0.35 degrees.
VALIDATION
Accuracy:  0.6535731187884525
Macro F1:  0.6395340815869939
Popular class: 
Precision:  0.7519500780031201
Recall:  0.45687203791469194
Unpopular class: 
Precision:  0.610733695652174
Recall:  0.8497164461247637


In [None]:
# Prediction on test set
predictions_test = rf.predict(contents_test_tokenized)

# Mean absolute difference between predicted and true labels. Assuming 0/1
# labels (as compute_metrics does), this is the fraction of misclassified
# posts, i.e. 1 - accuracy. It is a unitless rate, so no unit is printed.
errors = np.abs(predictions_test - labels_test)
print('Mean Absolute Error:', round(np.mean(errors), 2))

# Evaluate using compute_metrics function
macro_f1, P_popular, R_popular, P_unpopular, R_unpopular = compute_metrics(predictions_test, labels_test)
accuracy_test = accuracy_score(labels_test, predictions_test)
print("TEST")
print("Accuracy: ", accuracy_test)
print("Macro F1: ", macro_f1)
print("Popular class: ")
print("Precision: ", P_popular)
print("Recall: ", R_popular)
print("Unpopular class: ")
print("Precision: ", P_unpopular)
print("Recall: ", R_unpopular)

Mean Absolute Error: 0.42 degrees.
TEST
Accuracy:  0.5759897828863346
Macro F1:  0.5729864179396502
Popular class: 
Precision:  0.5897959183673469
Recall:  0.4931740614334471
Unpopular class: 
Precision:  0.5661066471877283
Recall:  0.6584536958368734
