In [22]:
import nltk
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize,RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer,PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix,precision_score,recall_score,f1_score,roc_auc_score,accuracy_score,roc_curve
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB
from sklearn.metrics import classification_report


In [2]:
# The 'author' column holds three distinct labels (EAP, HPL, MWS), which makes
# this a multi-class classification problem.
authors_df = pd.read_csv("Authors_dataset.csv")
authors_df

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL
...,...,...,...
19574,id17718,"I could have fancied, while I looked at it, th...",EAP
19575,id08973,The lids clenched themselves together as if in...,EAP
19576,id05267,"Mais il faut agir that is to say, a Frenchman ...",EAP
19577,id17513,"For an item of news like this, it strikes us i...",EAP


In [3]:
# Class balance check: number of rows per author label.
authors_df['author'].value_counts()

EAP    7900
MWS    6044
HPL    5635
Name: author, dtype: int64

In [4]:
# Split the target (output) column off from the features: `pop` returns the
# 'author' column and removes it from authors_df in place.
# Re-running this cell raises KeyError because the column is already gone.
y = authors_df.pop('author')

# Train/test split

In [5]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    authors_df,
    y,
    test_size=0.2,
    random_state=42,
)

# Tokenizing

In [6]:
# Tokenizer that extracts runs of word characters (\w+), so punctuation is
# not emitted as tokens.
tokenizer = RegexpTokenizer(pattern=r'\w+')

# Stopwords

In [7]:
# English stop-word list from the NLTK stopwords corpus; text_processing
# filters tokens against it.
en_sw=stopwords.words('english')
# Display the list for inspection.
en_sw

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

# Lemmatization

In [8]:
# Shared text-normalisation helpers and the vectorizer.
# NOTE(review): `stemmer` is not referenced by any cell visible here.
stemmer = PorterStemmer()
# Used by text_processing to lemmatize tokens.
lemmatizer = WordNetLemmatizer()
# This instance is replaced by a fresh TfidfVectorizer in the TF-IDF cell
# before any fitting happens.
tfidf = TfidfVectorizer()

In [9]:
# Preprocessing
def text_processing(text):
    """Normalise one document before TF-IDF vectorisation.

    The text is split with the notebook-level ``tokenizer``, every token is
    lower-cased, tokens found in the ``en_sw`` stop-word list are dropped,
    and each remaining token is lemmatized as a verb (``pos='v'``) with the
    notebook-level ``lemmatizer``. No stemming is applied.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces; an empty string when
        no token survives the filtering.
    """
    # Build the lookup set on each call so later changes to en_sw are still
    # honoured; set membership replaces a linear scan of the list per token.
    stop_words = set(en_sw)
    # Lower-case each token once (the previous version did it twice).
    lowered = (token.lower() for token in tokenizer.tokenize(text))
    return ' '.join(
        lemmatizer.lemmatize(token, pos='v')
        for token in lowered
        if token not in stop_words
    )

In [10]:
# Inspect the remaining columns now that the 'author' label has been removed.
authors_df.columns

Index(['id', 'text'], dtype='object')

In [11]:
# Apply text_processing (tokenising, stop-word removal, lemmatization; no
# stemming) to the text column of both splits.
# `assign` builds new frames instead of writing a column into the frames
# returned by train_test_split, which pandas may flag as chained assignment
# (SettingWithCopyWarning). The variable names are unchanged for later cells.
X_train = X_train.assign(text=X_train['text'].apply(text_processing))
X_test = X_test.assign(text=X_test['text'].apply(text_processing))

# TF-IDF

In [12]:
# TF-IDF vectorizer (document x term matrix).
# The vectorizer is fitted on the training text only; the test text is
# transformed with that same fitted vectorizer.
tfidf = TfidfVectorizer()
train_tfidf = tfidf.fit_transform(X_train['text'])
test_tfidf = tfidf.transform(X_test['text'])

In [13]:
# Show the repr of the training TF-IDF matrix (shape and sparsity summary).
train_tfidf

<15663x17914 sparse matrix of type '<class 'numpy.float64'>'
	with 199016 stored elements in Compressed Sparse Row format>

# Model

In [14]:
# Train a multinomial Naive Bayes classifier on the training TF-IDF matrix
# (fit returns the estimator itself), then predict an author for every row
# of the test matrix.
mnb = MultinomialNB().fit(train_tfidf, y_train)
mnb_pred = mnb.predict(test_tfidf)
mnb_pred

array(['EAP', 'MWS', 'MWS', ..., 'HPL', 'HPL', 'EAP'], dtype='<U3')

In [15]:
# Confusion matrix for the test split: rows are the true authors and columns
# the predicted authors, both in the order EAP, HPL, MWS.
confusion_matrix(y_true=y_test, y_pred=mnb_pred)

array([[1385,   65,  120],
       [ 217,  779,   75],
       [ 204,   42, 1029]], dtype=int64)

In [16]:
# Fraction of test rows whose predicted author matches the true author.
accuracy_score(y_true=y_test, y_pred=mnb_pred)

0.8153728294177732

In [24]:
# Per-author precision, recall and F1 on the test split; printed because the
# report is a pre-formatted text table.
print(classification_report(y_true=y_test, y_pred=mnb_pred))

              precision    recall  f1-score   support

         EAP       0.77      0.88      0.82      1570
         HPL       0.88      0.73      0.80      1071
         MWS       0.84      0.81      0.82      1275

    accuracy                           0.82      3916
   macro avg       0.83      0.81      0.81      3916
weighted avg       0.82      0.82      0.81      3916

