In [1]:
!pip install pymorphy2



In [2]:
!pip install catboost



In [3]:
import re
import pandas as pd
import matplotlib.pyplot as plt
import pymorphy2
import nltk
import string

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score, classification_report
from wordcloud import WordCloud, STOPWORDS
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [4]:
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
LEMMATIZER = WordNetLemmatizer()
PORTER_STEMMER = PorterStemmer()
ENGLISH_STOP_WORDS = set(stopwords.words('english'))

In [6]:
from google.colab import drive, files

In [7]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
uploaded = files.upload()

Saving train.csv to train.csv


In [9]:
df = pd.read_csv('/content/train.csv', index_col=0)

In [10]:
df.head()

Unnamed: 0,Text,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 41159 entries, 0 to 41156
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       41158 non-null  object
 1   Sentiment  41155 non-null  object
dtypes: object(2)
memory usage: 964.7+ KB


In [12]:
df.describe()

Unnamed: 0,Text,Sentiment
count,41158,41155
unique,41158,5
top,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Positive
freq,1,11422


### Preprocessing

In [13]:
df.isna().sum().sum()

5

In [14]:
df.dropna(inplace=True)

In [15]:
df.isna().sum().sum()

0

In [16]:
def text_preprocessing(text, stop_words=None, stem=None, lemmatizer=None):
    text = re.sub(r'http\S+', '', text)

    text = text.translate(str.maketrans("", "", string.punctuation))
    text = ''.join([i for i in text if not i.isdigit()])
    text = text.lower()

    if stop_words is not None:
        words = nltk.word_tokenize(text)
        filtered_words = [word for word in words if word not in stop_words]
        text = " ".join(filtered_words)

    if stem is not None:
        words = nltk.word_tokenize(text)
        stemmed_words = [stem.stem(word) for word in words]
        text = " ".join(stemmed_words)

    if lemmatizer is not None:
        words = nltk.word_tokenize(text)
        lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
        text = " ".join(lemmatized_words)

    return text

In [17]:
df['Text'] = df['Text'].apply(text_preprocessing, args=(ENGLISH_STOP_WORDS, PORTER_STEMMER, LEMMATIZER,))

In [18]:
df = df[df['Text'] != '']

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 41139 entries, 0 to 41156
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       41139 non-null  object
 1   Sentiment  41139 non-null  object
dtypes: object(2)
memory usage: 964.2+ KB


In [20]:
df.head()

Unnamed: 0,Text,Sentiment
0,menyrbi philgahan chrisitv,Neutral
1,advic talk neighbour famili exchang phone numb...,Positive
2,coronaviru australia woolworth give elderli di...,Positive
3,food stock one empti plea dont panic enough fo...,Positive
4,readi go supermarket covid outbreak im paranoi...,Extremely Negative


### Naive classification

In [21]:
df_naive = df.copy()

In [22]:
X_train_naive, X_test_naive, y_train_naive, y_test_naive = train_test_split(df_naive.Text, df_naive.Sentiment, shuffle=True, test_size=0.25, random_state=42)

In [23]:
vectorizer = CountVectorizer()

In [24]:
train_features = vectorizer.fit_transform(X_train_naive)
test_features = vectorizer.transform(X_test_naive)

In [25]:
train_features.shape

(30854, 39615)

In [26]:
test_features.shape

(10285, 39615)

In [27]:
classifier = MultinomialNB()

In [28]:
classifier.fit(train_features, y_train_naive)

In [29]:
predictions = classifier.predict(test_features)

In [30]:
print(classification_report(y_test_naive, predictions))

                    precision    recall  f1-score   support

Extremely Negative       0.64      0.30      0.41      1394
Extremely Positive       0.63      0.37      0.47      1669
          Negative       0.41      0.54      0.46      2439
           Neutral       0.69      0.30      0.42      1948
          Positive       0.40      0.65      0.50      2835

          accuracy                           0.47     10285
         macro avg       0.56      0.43      0.45     10285
      weighted avg       0.53      0.47      0.46     10285



### Split

In [31]:
vectorizer = CountVectorizer()
X_train_numeric = vectorizer.fit_transform(df.Text)

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X_train_numeric, df.Sentiment, shuffle=True, test_size=0.25, random_state=42)

In [33]:
X_train.shape

(30854, 48083)

In [34]:
y_train.shape

(30854,)

In [35]:
y_test.shape

(10285,)

In [36]:
X_train_pool = Pool(data=X_train, label=y_train, cat_features=[])

In [37]:
X_evaluation_pool = Pool(data=X_test, label=y_test, cat_features=[])

In [38]:
model = CatBoostClassifier(iterations=1000, early_stopping_rounds=100)

In [39]:
model.fit(X=X_train_pool, eval_set=X_evaluation_pool)

Learning rate set to 0.115642
0:	learn: 1.5811319	test: 1.5816213	best: 1.5816213 (0)	total: 540ms	remaining: 8m 59s
1:	learn: 1.5567409	test: 1.5573827	best: 1.5573827 (1)	total: 955ms	remaining: 7m 56s
2:	learn: 1.5390397	test: 1.5400349	best: 1.5400349 (2)	total: 1.36s	remaining: 7m 33s
3:	learn: 1.5238802	test: 1.5246053	best: 1.5246053 (3)	total: 1.79s	remaining: 7m 26s
4:	learn: 1.5088519	test: 1.5104393	best: 1.5104393 (4)	total: 2.17s	remaining: 7m 12s
5:	learn: 1.4979277	test: 1.4990436	best: 1.4990436 (5)	total: 2.6s	remaining: 7m 11s
6:	learn: 1.4875981	test: 1.4895932	best: 1.4895932 (6)	total: 3.01s	remaining: 7m 7s
7:	learn: 1.4780028	test: 1.4803751	best: 1.4803751 (7)	total: 3.42s	remaining: 7m 4s
8:	learn: 1.4699273	test: 1.4721778	best: 1.4721778 (8)	total: 3.82s	remaining: 7m
9:	learn: 1.4618692	test: 1.4646983	best: 1.4646983 (9)	total: 4.2s	remaining: 6m 56s
10:	learn: 1.4557470	test: 1.4587245	best: 1.4587245 (10)	total: 4.6s	remaining: 6m 53s
11:	learn: 1.4482809

<catboost.core.CatBoostClassifier at 0x7862cae92e60>

In [40]:
y_pred = model.predict(X_test)

In [41]:
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

Accuracy: 0.6201263976665046


In [42]:
print(f"Precision: {precision_score(y_test, y_pred, average='weighted')}")

Precision: 0.6306122450634108


In [43]:
print(f"Recall: {recall_score(y_test, y_pred, average='weighted')}")

Recall: 0.6201263976665046
