In [1]:
# Replace nltk stemmer with Polish stemmer version from pystempel repository (https://github.com/dzieciou/pystempel)

import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer
from stempel import StempelStemmer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
import pickle

In [2]:
sport = pd.read_csv(r"headers_corp.csv",encoding='utf-8').iloc[:5]
nonsport = pd.read_csv(r"headers_corp.csv",encoding='utf-8').iloc[-5:]
Corpus = pd.concat([sport,nonsport]).reset_index(drop=True)

In [3]:
# Preprocess text and format to lower case.

def preprocess(corpus):
    corpus = Corpus
    Corpus['text'].dropna(inplace=True)
    a = Corpus['text'].str.lower()
    Corpus['text'] = a
    # Tokenize text
    Corpus['text']= [word_tokenize(entry) for entry in Corpus['text']]

    # Remove polish stopwords and non alphabetical
    # Stem rest of the words

    for index,entry in enumerate(Corpus['text']):

        Final_words = []
        stemmer = StempelStemmer.default()

        for word in entry:
            # it's necessary to add polish stopwords to nltk_data/corpora/stopwords
            if word not in stopwords.words('polish') and word.isalpha():
                word_Final = stemmer.stem(word)
                Final_words.append(word_Final)

        # The final preprocessed set of words for each iteration will be stored in 'text_final'
        Corpus.loc[index,'text_final'] = str(Final_words)
        

In [4]:
preprocess(Corpus)
Corpus.text_final

0    ['my', 'ekspresowy', 'wygrać', 'poleka', 'debi...
1     ['historyczny', 'triumf', 'hubertaa', 'hurkacz']
2    ['wojciecha', 'fibak', 'życzyć', 'hubertow', '...
3    ['hubert', 'hurkacz', 'benoić', 'pairy', 'rela...
4    ['marić', 'lewandowski', 'pobić', 'rekordy', '...
5    ['ekspert', 'sytuacja', 'carlitosa', 'vuković'...
6    ['grażyna', 'torbicki', 'rezerwacja', 'polańsk...
7    ['ślub', 'andrzeja', 'wron', 'zofia', 'zborows...
8    ['siostry', 'ucieknąć', 'swój', 'saudyjski', '...
9    ['rudzki', 'gerrć', 'bardzo', 'obawiać', 'możl...
Name: text_final, dtype: object

In [5]:
# Split data to train dataset and validation dataset 
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],Corpus['label'],test_size=0.3)

In [6]:
# Labelizing

Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [7]:
# Transform datasets to vectors with TFIDF vectorizer

Tfidf_vect = TfidfVectorizer(max_features=1000)
Tfidf_vect.fit(Corpus['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [8]:
Test_X_Tfidf

<3x67 sparse matrix of type '<class 'numpy.float64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [9]:
import pickle

f = open('finalized_model.sav', 'rb')
model = pickle.load(f)
f.close()



In [13]:
predictions = model.predict(Test_X_Tfidf)

ValueError: dimension mismatch

In [None]:
# Fit the training dataset on the Naive Bayes classifier

model = naive_bayes.MultinomialNB()
model.fit(Train_X_Tfidf,Train_Y)

In [5]:
# save model to pickle

filename = 'finalized_model_01.sav'
pickle.dump(model, open(filename, 'wb'))