In [1]:
import pandas as pd
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nanta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
df_train = pd.read_csv("./Poem_classification - train_data.csv")
df_test = pd.read_csv("./Poem_classification - test_data.csv")

In [3]:
df_train.head()

Unnamed: 0,Genre,Poem
0,Music,
1,Music,In the thick brushthey spend the...
2,Music,Storms are generous. ...
3,Music,—After Ana Mendieta Did you carry around the ...
4,Music,for Aja Sherrard at 20The portent may itself ...


In [4]:
df_train.shape

(841, 2)

In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 841 entries, 0 to 840
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Genre   841 non-null    object
 1   Poem    837 non-null    object
dtypes: object(2)
memory usage: 13.3+ KB


In [6]:
df_train.isnull().sum()

Genre    0
Poem     4
dtype: int64

In [7]:
df_train = df_train.dropna(axis=0)

In [8]:
df_train.isnull().sum()

Genre    0
Poem     0
dtype: int64

In [9]:
df_train.Genre.value_counts()

Genre
Music          238
Death          231
Environment    227
Affection      141
Name: count, dtype: int64

In [10]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nanta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
def count_text(text):
    tokens = nltk.word_tokenize(text)
    patt = re.compile("[a-zA-Z0-9]+")
    count = 0
    for token in tokens:
        if patt.match(token):
            count +=1
    return count

In [12]:
df_train["length"] = df_train.Poem.apply(count_text)

In [13]:
df_train.head()

Unnamed: 0,Genre,Poem,length
1,Music,In the thick brushthey spend the...,26
2,Music,Storms are generous. ...,26
3,Music,—After Ana Mendieta Did you carry around the ...,40
4,Music,for Aja Sherrard at 20The portent may itself ...,36
5,Music,"for Bob Marley, Bavaria, November 1980 Here i...",37


In [14]:
nltk.word_tokenize(df_train["Poem"][1])

['In',
 'the',
 'thick',
 'brushthey',
 'spend',
 'the',
 'hottest',
 'part',
 'of',
 'the',
 'day',
 ',',
 'soaking',
 'their',
 'hoovesin',
 'the',
 'trickle',
 'of',
 'mountain',
 'water',
 'the',
 'ravine',
 'hoardson',
 'behalf',
 'of',
 'the',
 'oleander',
 '.']

In [15]:
df_train.length.describe()

count    837.000000
mean      47.861410
std       12.887808
min        3.000000
25%       44.000000
50%       52.000000
75%       56.000000
max       70.000000
Name: length, dtype: float64

In [16]:
def tokenizer_text(text):
    tokens = nltk.word_tokenize(text)
    patt = re.compile("[a-zA-Z0-9]+")
    token = [x for x in tokens if patt.match(x)]
    return "|".join(token)
    

In [17]:
tokenizer_text("I love to play football")

'I|love|to|play|football'

In [18]:
df_train["tokens"] = df_train["Poem"].apply(tokenizer_text)
df_train.head()

Unnamed: 0,Genre,Poem,length,tokens
1,Music,In the thick brushthey spend the...,26,In|the|thick|brushthey|spend|the|hottest|part|...
2,Music,Storms are generous. ...,26,Storms|are|generous|Something|so|easy|to|surre...
3,Music,—After Ana Mendieta Did you carry around the ...,40,Ana|Mendieta|Did|you|carry|around|the|matin|st...
4,Music,for Aja Sherrard at 20The portent may itself ...,36,for|Aja|Sherrard|at|20The|portent|may|itself|b...
5,Music,"for Bob Marley, Bavaria, November 1980 Here i...",37,for|Bob|Marley|Bavaria|November|1980|Here|is|t...


In [19]:
df_train_last = df_train[["Genre", "tokens"]]

In [20]:
df_train_last

Unnamed: 0,Genre,tokens
1,Music,In|the|thick|brushthey|spend|the|hottest|part|...
2,Music,Storms|are|generous|Something|so|easy|to|surre...
3,Music,Ana|Mendieta|Did|you|carry|around|the|matin|st...
4,Music,for|Aja|Sherrard|at|20The|portent|may|itself|b...
5,Music,for|Bob|Marley|Bavaria|November|1980|Here|is|t...
...,...,...
836,Environment,Why|make|so|much|of|fragmentary|blue|In|here|a...
837,Environment,Woman|I|wish|I|did|n't|know|your|name|What|cou...
838,Environment,Yonder|to|the|kiosk|beside|the|creek|Paddle|th...
839,Environment,You|come|to|fetch|me|from|my|work|to-night|Whe...


# Using CountVectorizer

In [21]:
# Count Vectorizer
vectorizer = CountVectorizer()
text_feature = vectorizer.fit_transform(df_train["Poem"])

In [22]:
df_train["Genre"].value_counts(normalize=True)

Genre
Music          0.284349
Death          0.275986
Environment    0.271207
Affection      0.168459
Name: proportion, dtype: float64

In [23]:
X_train, X_test, y_train, y_test = train_test_split(text_feature, df_train["Genre"], random_state=0, test_size=0.2)

In [24]:
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, random_state=0, test_size=0.2)

In [25]:
lr = LogisticRegression(multi_class="multinomial", max_iter=1000)

In [26]:
lr.fit(X_tr, y_tr)

In [27]:
model_predicted = lr.predict(X_val)

In [28]:
print(classification_report(model_predicted, y_val))

              precision    recall  f1-score   support

   Affection       0.38      0.42      0.40        19
       Death       0.32      0.39      0.35        28
 Environment       0.53      0.50      0.51        42
       Music       0.49      0.42      0.45        45

    accuracy                           0.44       134
   macro avg       0.43      0.43      0.43       134
weighted avg       0.45      0.44      0.44       134



In [29]:
weight = pd.DataFrame(lr.coef_, columns=vectorizer.get_feature_names_out(), index=lr.classes_)

In [30]:
weight

Unnamed: 0,00,000,10maggie,12,13,15,17,1865there,1908,1913,...,zenith,zephyrs,zest,zhashagiwag,zhingwaakwag,zipperedshut,zither,zithers,zoom,ȟe
Affection,0.0,-0.029458,-0.028811,0.0,-0.014899,-0.027415,-0.027415,0.0,-0.029913,-0.013307,...,0.0,0.0,0.000386,-0.007504,-0.007504,-0.008301,-0.019078,0.109487,0.0,-0.030173
Death,0.0,0.021294,-0.041012,0.0,0.044497,-0.029181,-0.029181,0.0,-0.037152,0.265808,...,0.0,0.0,-8.7e-05,-0.01669,-0.01669,0.046234,-0.031246,-0.052556,0.0,-0.027047
Environment,0.0,-0.033395,0.086605,0.0,-0.013744,-0.013348,-0.013348,0.0,-0.0327,-0.108993,...,0.0,0.0,-0.000159,0.037926,0.037926,-0.000802,-0.046615,-0.011131,0.0,0.120179
Music,0.0,0.041559,-0.016781,0.0,-0.015855,0.069944,0.069944,0.0,0.099764,-0.143509,...,0.0,0.0,-0.00014,-0.013732,-0.013732,-0.03713,0.09694,-0.0458,0.0,-0.062959


In [31]:
weight.shape

(4, 8437)

In [32]:
weight.transpose().sort_values("Music", ascending=False)

Unnamed: 0,Affection,Death,Environment,Music
makes,-0.251332,-0.113942,-0.090153,0.455427
loyalty,-0.053721,-0.153104,-0.225895,0.432720
into,-0.308451,0.009108,-0.102271,0.401614
next,-0.141189,-0.103317,-0.155492,0.399998
take,-0.027884,-0.127450,-0.229368,0.384703
...,...,...,...,...
rain,0.133497,-0.177772,0.416806,-0.372530
by,0.296442,0.042514,0.039719,-0.378675
there,0.057733,0.514168,-0.165964,-0.405937
long,0.037966,0.021892,0.348202,-0.408060


In [33]:
weight.transpose().sort_values("Affection", ascending=False)

Unnamed: 0,Affection,Death,Environment,Music
love,0.735769,-0.430292,-0.242356,-0.063121
that,0.547368,-0.245017,-0.169037,-0.133315
thou,0.435137,0.033752,-0.117813,-0.351076
thee,0.432589,-0.062716,-0.187189,-0.182683
at,0.424595,-0.096800,-0.217176,-0.110618
...,...,...,...,...
because,-0.310425,0.014288,-0.042654,0.338791
even,-0.314570,0.084691,0.044806,0.185074
as,-0.423131,0.286410,0.108194,0.028527
has,-0.437016,0.421986,0.021305,-0.006274


In [34]:
weight.transpose().sort_values("Death", ascending=False)

Unnamed: 0,Affection,Death,Environment,Music
dead,-0.437043,0.777757,-0.127880,-0.212834
death,-0.087093,0.633873,-0.304908,-0.241873
silver,-0.037831,0.576952,-0.203375,-0.335746
died,-0.101038,0.522389,-0.177046,-0.244305
there,0.057733,0.514168,-0.165964,-0.405937
...,...,...,...,...
dream,0.222011,-0.342037,0.212980,-0.092954
moon,-0.105673,-0.352311,0.335276,0.122708
then,0.199887,-0.384943,-0.152529,0.337585
little,0.145445,-0.403244,0.013542,0.244257


In [35]:
weight.transpose().sort_values("Environment", ascending=False)

Unnamed: 0,Affection,Death,Environment,Music
trees,-0.265992,0.054392,0.564092,-0.352492
sun,-0.064689,-0.192233,0.553996,-0.297074
grass,-0.280988,-0.176204,0.500644,-0.043452
sky,-0.298276,-0.254365,0.471902,0.080739
feeds,-0.072031,-0.166331,0.447295,-0.208932
...,...,...,...,...
their,-0.094509,0.117021,-0.401558,0.379046
its,0.297102,0.086288,-0.406781,0.023391
for,-0.208473,0.505092,-0.408818,0.112200
who,0.278180,-0.039075,-0.490899,0.251794


In [36]:
weight.transpose().loc["love"]

Affection      0.735769
Death         -0.430292
Environment   -0.242356
Music         -0.063121
Name: love, dtype: float64

<h3>Bias of Class

In [37]:
bias_cl = pd.DataFrame(lr.intercept_, index=lr.classes_)
bias_cl

Unnamed: 0,0
Affection,-1.828164
Death,-0.037358
Environment,0.930659
Music,0.934863


# Using Dict vectorizer

In [38]:
# feature engineering
def tokenizer_text(text):
    return {x:1 for x in nltk.word_tokenize(text)}

def feature_text_length(text):
    feature_dict = {}
    feature_dict["text length"] = len(text)
    for token in nltk.word_tokenize(text):
        if token not in feature_dict:
            feature_dict[token] = 0
            feature_dict[token] += 1
    return feature_dict

In [39]:
vectorizer_2 = DictVectorizer(sparse=True)
df_train_dict = df_train["Poem"].apply(feature_text_length)

In [40]:
feature_vector = vectorizer_2.fit_transform(df_train_dict)
feature_vector

<837x9765 sparse matrix of type '<class 'numpy.float64'>'
	with 36254 stored elements in Compressed Sparse Row format>

In [41]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(feature_vector, df_train["Genre"], random_state=0, test_size=0.2)

In [42]:
X_tr_2, X_val_2, y_tr_2, y_val_2 = train_test_split(X_train_2, y_train_2, random_state=0, test_size=0.2)

In [46]:
lr_dict = LogisticRegression(max_iter=1000)

In [47]:
lr_dict.fit(X_tr_2, y_tr_2)

In [49]:
model_predicted_2 = lr_dict.predict(X_val_2)

In [50]:
print(classification_report(y_val_2, model_predicted_2))

              precision    recall  f1-score   support

   Affection       0.55      0.29      0.37        21
       Death       0.47      0.41      0.44        34
 Environment       0.44      0.47      0.46        40
       Music       0.52      0.67      0.58        39

    accuracy                           0.49       134
   macro avg       0.49      0.46      0.46       134
weighted avg       0.49      0.49      0.48       134



In [51]:
weight2 = pd.DataFrame(lr_dict.coef_, columns=vectorizer_2.get_feature_names_out(), index=lr_dict.classes_)
weight2

Unnamed: 0,!,#,%,&,','','T,'What,'apothecary,'d,...,—then,—they,—those,—water,—with,‘,’,“,”,…
Affection,0.178139,0.0,-0.011992,0.116309,-0.106499,-0.095625,-0.009259,-0.021885,-0.013274,-0.022006,...,0.142078,0.0,0.051743,0.0,-0.007866,0.039648,-0.327699,-0.059097,-0.060639,0.0
Death,-0.146599,0.0,-0.033805,-0.056638,0.001157,0.078755,0.053795,0.052742,-0.029112,0.293854,...,-0.056091,0.0,-0.039372,0.0,0.061201,0.010951,-0.12815,-0.281212,-0.293385,0.0
Environment,0.564251,0.0,-0.007894,-0.273701,0.067554,0.153711,-0.029848,-0.006046,-0.040222,-0.033915,...,-0.050927,0.0,-0.009861,0.0,-0.038016,0.004443,0.043121,0.287072,0.306196,0.0
Music,-0.595791,0.0,0.053691,0.21403,0.037789,-0.13684,-0.014688,-0.024811,0.082608,-0.237933,...,-0.03506,0.0,-0.002511,0.0,-0.015319,-0.055043,0.412728,0.053237,0.047828,0.0


In [52]:
weight2.transpose().sort_values("Affection", ascending=False)

Unnamed: 0,Affection,Death,Environment,Music
love,0.779683,-0.453735,-0.245880,-0.080069
And,0.487916,-0.036177,-0.138305,-0.313435
When,0.483631,-0.000701,-0.318208,-0.164722
thee,0.481505,-0.136593,-0.199086,-0.145827
me,0.469901,-0.115558,-0.229990,-0.124354
...,...,...,...,...
sky,-0.341524,-0.255965,0.414692,0.182797
some,-0.354652,0.210970,-0.034190,0.177872
has,-0.393015,0.388009,0.020654,-0.015648
dead,-0.461508,0.721345,-0.136707,-0.123130


In [54]:
bias_dict_2 = pd.DataFrame(lr_dict.intercept_, index=lr_dict.classes_)
bias_dict_2

Unnamed: 0,0
Affection,-1.209077
Death,0.014343
Environment,0.471515
Music,0.723218


In [56]:
print(classification_report(y_val_2, model_predicted_2))

              precision    recall  f1-score   support

   Affection       0.55      0.29      0.37        21
       Death       0.47      0.41      0.44        34
 Environment       0.44      0.47      0.46        40
       Music       0.52      0.67      0.58        39

    accuracy                           0.49       134
   macro avg       0.49      0.46      0.46       134
weighted avg       0.49      0.49      0.48       134



In [55]:
print(classification_report(y_val, model_predicted))

              precision    recall  f1-score   support

   Affection       0.42      0.38      0.40        21
       Death       0.39      0.32      0.35        34
 Environment       0.50      0.53      0.51        40
       Music       0.42      0.49      0.45        39

    accuracy                           0.44       134
   macro avg       0.43      0.43      0.43       134
weighted avg       0.44      0.44      0.44       134



I select f1-score to validate model f1-score in model2 slightly higher than model1 so i use Dict vectorizer instead of CountVectorizer

In [60]:
predicted = lr_dict.predict(X_test_2)

In [61]:
print(classification_report(y_test_2, predicted))

              precision    recall  f1-score   support

   Affection       0.18      0.09      0.12        33
       Death       0.28      0.18      0.22        51
 Environment       0.42      0.51      0.46        39
       Music       0.42      0.67      0.52        45

    accuracy                           0.37       168
   macro avg       0.32      0.36      0.33       168
weighted avg       0.33      0.37      0.33       168



In [62]:
# check with model1
predicted2 = lr.predict(X_test)
print(classification_report(y_test, predicted2))

              precision    recall  f1-score   support

   Affection       0.35      0.24      0.29        33
       Death       0.26      0.20      0.22        51
 Environment       0.31      0.38      0.34        39
       Music       0.34      0.44      0.39        45

    accuracy                           0.32       168
   macro avg       0.32      0.32      0.31       168
weighted avg       0.31      0.32      0.31       168



In [63]:
df_test_dict = df_test["Poem"].apply(feature_text_length)
df_test_vector = vectorizer_2.transform(df_test_dict)

In [64]:
predicted_test = lr_dict.predict(df_test_vector)

In [66]:
dif = pd.DataFrame({"Predict": predicted_test, "Real": df_test["Genre"]})
dif

Unnamed: 0,Predict,Real
0,Death,Music
1,Music,Music
2,Music,Music
3,Music,Music
4,Environment,Music
...,...,...
145,Music,Environment
146,Environment,Environment
147,Environment,Environment
148,Environment,Environment
