In [1]:
import fasttext
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import classification_report
from sklearn.utils import resample
import matplotlib.pyplot as plt
import seaborn as sns

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/jh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    #remove tag
    text = re.sub('@[^\s]+','', text)

    #remove link
    text = re.sub('((www\.[^\s]+)|(http?://[^\s]+))','',text)

    # Remove punctuation
    text = re.sub("[^a-zA-Z]", " ", text)

    #lowercase and tokenize
    tokens = nltk.word_tokenize(text.lower())

    #remove stopwrods
    tokens = [word for word in tokens if word not in stop_words]  

    #stemming
    tokens = [stemmer.stem(word) for word in tokens]
    
    return ' '.join(tokens)
    
def saveTxt(file_name, df):
    text_list = df['text'].astype(str).tolist()
    label_list = df['sentiment'].astype(str).tolist()
    
    combined_list = [f"__label__{label} {text.strip()}\n" for label, text in zip(label_list, text_list)]
    
    with open(file_name,'w') as file:
        file.writelines(combined_list)

In [3]:
df_train = pd.read_csv("../train.csv")
df_train['text'] = df_train['text'].astype(str).apply(preprocess)
df_train.dropna(inplace=True)
df_train.drop_duplicates(subset='text', inplace=True)

df_test = pd.read_csv("../test.csv")
df_test['text'] = df_test['text'].astype(str).apply(preprocess)
df_test.dropna(inplace=True)
df_test.drop_duplicates(subset='text', inplace=True)

In [4]:
saveTxt('train.txt', df_train)
saveTxt('test.txt', df_test)

In [5]:
model = fasttext.train_supervised(input='train.txt', autotuneValidationFile='test.txt', autotuneModelSize="2M")
result = model.test('test.txt')
print(result)

Progress: 100.0% Trials:   21 Best score:  0.718499 ETA:   0h 0m 0s
Training again with best arguments
Read 0M words
Number of words:  18578
Number of labels: 3
Progress: 100.0% words/sec/thread:  129465 lr:  0.000000 avg.loss:  0.717932 ETA:   0h 0m 0s
Progress: 100.0% words/sec/thread:  128283 lr:  0.000000 avg.loss:  0.550894 ETA:   0h 0m 0s


(3492, 0.7164948453608248, 0.7164948453608248)


In [6]:
model.save_model("model.ftz")