The aim of this project is to develop a machine learning model capable of accurately predicting the author of a game review (authorship attribution). By leveraging web scraping techniques, the project collects a substantial dataset of Polish-language game reviews from the gry-online.pl website. The focus is on preprocessing the collected data through various text cleaning and feature extraction techniques, such as lemmatization, stop-word removal, part-of-speech n-grams, average sentence length and named-entity counts. With the help of classification models like Multinomial Naive Bayes and AdaBoost with Random Forest, the project aims to train and optimize models that can recognize which of the three most prolific reviewers wrote a given review. The ultimate goal is to create a reliable author-identification tool specifically tailored for game reviews, enabling users to gain insight into what distinguishes each reviewer's writing style.

# Imports

In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab Notebooks/tvpgry

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/tvpgry


In [None]:
!pip install html2text
!pip install -U pip setuptools wheel
!pip install -U spacy
!python3 -m spacy download pl_core_news_lg

In [None]:
from bs4 import BeautifulSoup
import requests
import html2text
import re
import pickle
import pandas as pd
import json
import time
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import pl_core_news_lg
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report

In [None]:
nlp = spacy.load('pl_core_news_lg')

In [None]:
url = 'https://www.gry-online.pl/recenzje-gier.asp?STR=1'

content = requests.get(url).content
soup = BeautifulSoup(content, 'html.parser')

In [None]:
print(soup.prettify())

In [None]:
['https://www.gry-online.pl' + url['href'] for url in soup.find('div', {'class':'czyt-lmat-grid'}).find_all('a')]

In [None]:
url = 'https://www.gry-online.pl/S020.asp?ID=11690'

content = requests.get(url).content
soup = BeautifulSoup(content, 'html.parser')

In [None]:
json.loads(soup.find('script', {'type':'application/ld+json'}).get_text())['author']['name']

'Krzysztof Draug Mysiak'

In [None]:
article_content = [str(item) for item in soup.find('article', {'class': 'word-txt'}).find_all('p')]

In [None]:
article_content

In [None]:
author_pattern = re.compile(r'\[(.*?)\]', re.DOTALL)

In [None]:
matching = re.search(author_pattern, text_review)
extracted_text = matching.group(1)

In [None]:
print(extracted_text)

Krzysztof "Draug" Mysiak


In [None]:
merged_text  = ' '.join(article_content)
plain_text = html2text.html2text(merged_text)

In [None]:
text_review = plain_text.replace('\n', ' ')

In [None]:
import re
import time
import requests
from bs4 import BeautifulSoup
import html2text
import json

class TvpGryReviewsScrapper:
    """A class to scrape game reviews from gry-online.pl website."""

    def __init__(self):
        """Initializes the TvpGryReviewsScrapper class."""
        self.url = f'https://www.gry-online.pl/recenzje-gier.asp?'
        self.patterns_removal = {
            re.compile(r'_autor:.*?\s{2}', re.DOTALL): '',
            re.compile(r'\[.*?\]\(.*?\)', re.DOTALL): '',
            re.compile(r'_(.*?)_', re.DOTALL): r'"\1"'
        }
        self.wait_main = time.sleep(2)
        self.wait_submain = time.sleep(1 / 100)

    def parse_page_links(self, page):
        """Fetch one page of the review listing and parse its HTML.

        Args:
            page: An integer representing the page number.

        Returns:
            BeautifulSoup: Parsed HTML content, or None when the HTTP request
            fails (the error is printed).
        """
        try:
            content = requests.get(self.url + f'STR={page}').content
        except requests.RequestException as e:
            print(f'{e}')
            return None
        # A bare attribute reference does not pause anything, so sleep
        # explicitly. Fall back to 2 s when no numeric delay is stored.
        delay = 2 if self.wait_main is None else self.wait_main
        time.sleep(delay)
        return BeautifulSoup(content, 'html.parser')

    def get_links(self, content):
        """Extracts the review links from the parsed content.

        Args:
            content (BeautifulSoup): Parsed HTML of a listing page. May be
                None, which is what ``parse_page_links`` yields after a
                failed request.

        Returns:
            list: Absolute review URLs. Empty when ``content`` is None or the
            ``czyt-lmat-grid`` container is not found on the page.
        """
        if content is None:
            return []
        container = content.find('div', {'class': 'czyt-lmat-grid'})
        if container is None:
            # Nothing to iterate; avoid AttributeError on None.find_all.
            return []
        return ['https://www.gry-online.pl' + anchor['href'] for anchor in container.find_all('a')]

    def get_page_review(self, link):
        """Fetches and parses the HTML content of a review page.

        Args:
            link (str): URL of the review page.

        Returns:
            BeautifulSoup: Parsed HTML content, or None when the HTTP request
            fails (the error is printed).
        """
        try:
            content = requests.get(link).content
        except requests.RequestException as e:
            print(f'{e}')
            return None
        # A bare attribute reference does not pause anything, so sleep
        # explicitly. Fall back to 10 ms when no numeric delay is stored.
        delay = 1 / 100 if self.wait_submain is None else self.wait_submain
        time.sleep(delay)
        return BeautifulSoup(content, 'html.parser')

    def get_review_and_author(self, content):
        """Pull the cleaned review body and the author's name out of a review page.

        The ``<p>`` elements of the ``article.word-txt`` node are converted to
        plain text with html2text, newlines become spaces, ``**`` markers are
        dropped, and every pattern in ``self.patterns_removal`` is applied.
        The author is read from the page's ``application/ld+json`` script.

        Args:
            content (BeautifulSoup): Parsed HTML content.

        Returns:
            tuple: ``(review_text, author_name)``.
        """
        article = content.find('article', {'class': 'word-txt'})
        paragraphs_html = ' '.join(str(paragraph) for paragraph in article.find_all('p'))

        review_text = html2text.html2text(paragraphs_html)
        review_text = review_text.replace('\n', ' ')
        review_text = review_text.replace('**', '')

        metadata_tag = content.find('script', {'type': 'application/ld+json'})
        metadata = json.loads(metadata_tag.get_text())
        author = metadata['author']['name']

        # Apply the clean-up substitutions in the order they were registered.
        for regex, substitute in self.patterns_removal.items():
            review_text = regex.sub(substitute, review_text)

        return review_text, author

    def get_reviews(self, start_page, end_page):
        """Scrapes the reviews within a specified range of pages.

        Args:
            start_page (int): Starting page number.
            end_page (int): Ending page number.

        Returns:
            list: List of dictionaries containing review text and author.
        """
        reviews_data = []
        for page in range(start_page, end_page + 1):
            links_list = self.get_links(self.parse_page_links(page))
            for link in links

In [None]:
tvp_scrapper = TvpGryReviewsScrapper()
reviews_data = tvp_scrapper.get_reviews(1, 25)

In [None]:
df = pd.DataFrame(reviews_data)

In [None]:
df['author'].value_counts()[:3]

Dariusz DM Matusiak                74
Przemysław Zamęcki                 72
Michał Czarny Wilk Grygorcewicz    66
Name: author, dtype: int64

In [None]:
list_reviews = df['review'].to_list()

In [None]:
list_reviews[0]

'Redfall miał być nowym, lepszym Left 4 Dead, ale wyszedł z tego raczej uboższy Far Cry. Tylko czasem daje się gdzieś dostrzec przebłysk geniuszu w budowaniu światów charakterystyczny dla Arkane, które tym razem poległo w nowym dla siebie gatunku.  Recenzja powstała na bazie wersji .  Arkane Studios tworzące kooperacyjnego looter shootera? Pierwsze wzmianki o "Redfallu" nie brzmiały jeszcze podejrzanie. Wcześniej mieliśmy przecież zwrot w stronę roguelike’a w dobrym "Deathloopie" , a parę lat temu współpracę przy kooperacyjnym "Wolfensteinie: Youngblood". Twórcy "Dishonored" i "Preya" widać szukają nowych dróg rozwoju, ale im więcej godzin spędzałem z  "Redfallem" , tym bardziej utwierdzałem się w przekonaniu, że coś gdzieś po drodze poszło nie tak, że chyba miał to być kolejny duży, fabularny FPS, ale ktoś w połowie procesu produkcji zmienił zdanie. I aby grę jeszcze uratować, dorobiono na szybko lootowanie w co-opie.  Bo choć tematem przewodnim "Redfalla" są wampiry, powstał z tego r

# Data

In [None]:
df.to_csv('review_data.csv', index=False)

In [None]:
df = pd.read_csv('review_data.csv')

In [None]:
df_top3 = df.loc[df['author'].isin(df['author'].value_counts().head(3).index.tolist()), ['review', 'author']].reset_index(drop=True)

# Functions

In [None]:
def lemmatize(sentence):
    """Return `sentence` with each spaCy token replaced by its lemma, space-joined."""
    doc = nlp(sentence)
    lemmas = (token.lemma_ for token in doc)
    return ' '.join(lemmas)

In [None]:
def remove_stop_words(sentence):
    """Return the space-joined token texts of `sentence`, leaving out stop words."""
    kept = []
    for token in nlp(sentence):
        if token.is_stop:
            continue
        kept.append(token.text)
    return ' '.join(kept)

In [None]:
def measure_sentence_length(sentence):
    """Return the mean sentence length of a text, measured in spaCy tokens.

    Args:
        sentence: Text to analyse (may contain several sentences).

    Returns:
        The average of ``len(sent)`` over the sentences spaCy detects, or
        0.0 when it detects none (e.g. for an empty string).
    """
    sentence_lengths = [len(sent) for sent in nlp(sentence).sents]
    if not sentence_lengths:
        # No sentences: avoid dividing by zero.
        return 0.0
    return sum(sentence_lengths) / len(sentence_lengths)

In [None]:
def extract_named_entities(review):
    """Count the named entities spaCy recognises in `review`.

    Despite the name, the entity texts are not returned, only how many
    entities were found.
    """
    return sum(1 for _ in nlp(review).ents)

# Data preparation

In [None]:
X = df_top3['review']
y = df_top3['author']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
X_train = pd.DataFrame({'review': X_train})
X_test = pd.DataFrame({'review': X_test})


In [None]:
y_train = pd.DataFrame({'author': y_train})
y_test = pd.DataFrame({'author': y_test})

In [None]:
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [None]:
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)

In [None]:
X_train.info()
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169 entries, 0 to 168
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  169 non-null    object
dtypes: object(1)
memory usage: 1.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  43 non-null     object
dtypes: object(1)
memory usage: 472.0+ bytes


In [None]:
X_train['lemma'] = X_train['review'].map(lemmatize)
X_test['lemma'] = X_test['review'].map(lemmatize)

In [None]:
X_train['review_no_stop_words'] = X_train['review'].map(remove_stop_words)
X_test['review_no_stop_words'] = X_test['review'].map(remove_stop_words)

In [None]:
def part_of_speech(sentence):
    """Return the coarse part-of-speech tag of each token in `sentence`, space-joined.

    NOTE(review): this helper is not defined in any earlier cell. It was
    reconstructed to match the recorded 'pos' values (e.g. "PROPN PROPN X
    PUNCT ...") -- confirm against the original implementation.
    """
    return ' '.join(token.pos_ for token in nlp(sentence))


X_train['pos'] = X_train['review'].map(part_of_speech)
X_test['pos'] = X_test['review'].map(part_of_speech)

In [None]:
X_train['sent_length'] = X_train['review'].map(measure_sentence_length)
X_test['sent_length'] = X_test['review'].map(measure_sentence_length)

In [None]:
X_train['entities'] = X_train['review'].map(extract_named_entities)
X_test['entities'] = X_test['review'].map(extract_named_entities)

## Term frequency (Unigram)

In [None]:
tfidf = TfidfVectorizer(use_idf=False)
tfidf.fit(X_train['pos'])

In [None]:
tf_pos_train = tfidf.transform(X_train['pos'])
tf_pos_test = tfidf.transform(X_test['pos'])

In [None]:
print(len(tf_pos_train.todense()))

169


In [None]:
pos_df_train = pd.DataFrame(tf_pos_train.todense(), columns=tfidf.get_feature_names_out())
pos_tf_test = pd.DataFrame(tf_pos_test.todense(), columns=tfidf.get_feature_names_out())

In [None]:
pos_tf_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   adj     43 non-null     float64
 1   adp     43 non-null     float64
 2   adv     43 non-null     float64
 3   aux     43 non-null     float64
 4   cconj   43 non-null     float64
 5   det     43 non-null     float64
 6   intj    43 non-null     float64
 7   noun    43 non-null     float64
 8   num     43 non-null     float64
 9   part    43 non-null     float64
 10  pron    43 non-null     float64
 11  propn   43 non-null     float64
 12  punct   43 non-null     float64
 13  sconj   43 non-null     float64
 14  space   43 non-null     float64
 15  sym     43 non-null     float64
 16  verb    43 non-null     float64
dtypes: float64(17)
memory usage: 5.8 KB


In [None]:
X_train = pd.concat([X_train, pos_df_train], axis=1)
X_test = pd.concat([X_test, pos_tf_test], axis=1)

## Term frequency (Bigram)

In [None]:
tfidf = TfidfVectorizer(use_idf=False, ngram_range=(2, 2), max_features = 30)
tfidf.fit(X_train['pos'])

In [None]:
tf_pos_train = tfidf.transform(X_train['pos'])
tf_pos_test = tfidf.transform(X_test['pos'])

In [None]:
print(len(tf_pos_train.todense()))

169


In [None]:
pos_df_train = pd.DataFrame(tf_pos_train.todense(), columns=tfidf.get_feature_names_out())
pos_tf_test = pd.DataFrame(tf_pos_test.todense(), columns=tfidf.get_feature_names_out())

In [None]:
pos_tf_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 30 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   adj adp      43 non-null     float64
 1   adj noun     43 non-null     float64
 2   adj punct    43 non-null     float64
 3   adp adj      43 non-null     float64
 4   adp noun     43 non-null     float64
 5   adv adj      43 non-null     float64
 6   det noun     43 non-null     float64
 7   noun adj     43 non-null     float64
 8   noun adp     43 non-null     float64
 9   noun cconj   43 non-null     float64
 10  noun noun    43 non-null     float64
 11  noun punct   43 non-null     float64
 12  noun verb    43 non-null     float64
 13  part verb    43 non-null     float64
 14  pron adp     43 non-null     float64
 15  propn propn  43 non-null     float64
 16  propn punct  43 non-null     float64
 17  punct adj    43 non-null     float64
 18  punct adp    43 non-null     float64
 19  punct ccon

In [None]:
X_train = pd.concat([X_train, pos_df_train], axis=1)
X_test = pd.concat([X_test, pos_tf_test], axis=1)

## Term frequency (Trigram)

In [None]:
tfidf = TfidfVectorizer(use_idf=False, ngram_range=(3, 3), max_features = 30)
tfidf.fit(X_train['pos'])

In [None]:
tf_pos_train = tfidf.transform(X_train['pos'])
tf_pos_test = tfidf.transform(X_test['pos'])

In [None]:
print(len(tf_pos_train.todense()))

169


In [None]:
pos_df_train = pd.DataFrame(tf_pos_train.todense(), columns=tfidf.get_feature_names_out())
pos_tf_test = pd.DataFrame(tf_pos_test.todense(), columns=tfidf.get_feature_names_out())

In [None]:
pos_tf_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   adj adp noun       43 non-null     float64
 1   adj noun adj       43 non-null     float64
 2   adj noun adp       43 non-null     float64
 3   adj noun noun      43 non-null     float64
 4   adj noun punct     43 non-null     float64
 5   adp adj noun       43 non-null     float64
 6   adp det noun       43 non-null     float64
 7   adp noun adj       43 non-null     float64
 8   adp noun adp       43 non-null     float64
 9   adp noun noun      43 non-null     float64
 10  adp noun punct     43 non-null     float64
 11  adv adj noun       43 non-null     float64
 12  det noun punct     43 non-null     float64
 13  noun adj noun      43 non-null     float64
 14  noun adj punct     43 non-null     float64
 15  noun adp adj       43 non-null     float64
 16  noun adp noun      43 non-nu

In [None]:
X_train = pd.concat([X_train, pos_df_train], axis=1)
X_test = pd.concat([X_test, pos_tf_test], axis=1)

In [None]:
X_train.sample()

Unnamed: 0,review,lemma,review_no_stop_words,pos,sent_length,entities,adj,adp,adv,aux,...,noun punct noun,noun punct sconj,noun punct space,noun punct verb,noun verb pron,propn propn punct,punct adj noun,punct propn propn,verb adp noun,verb pron adp
5,Dragon Ball Z: Kakarot to gra dla fanów wciąż ...,Dragon Ball Z : Kakarot to grać dla fan wciąż ...,Dragon Ball : Kakarot gra fanów wciąż przepełn...,PROPN PROPN X PUNCT PROPN AUX VERB ADP NOUN AD...,17.416667,20,0.309776,0.285947,0.163824,0.077444,...,0.115841,0.115841,0.162178,0.208514,0.115841,0.254851,0.162178,0.208514,0.208514,0.162178


# Model based on extracted features

In [None]:
X_train_num_features = X_train.select_dtypes(include=['int', 'float'])
X_test_num_features = X_test.select_dtypes(include=['int', 'float'])

In [None]:
scaler = MinMaxScaler()
X_normalized_train = scaler.fit_transform(X_train_num_features)
X_normalized_test = scaler.transform(X_test_num_features)

In [None]:
nb = MultinomialNB()
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 5.0, 10.0]}

grid_search =  GridSearchCV(nb, param_grid, cv=10)
grid_search.fit(X_normalized_train, y_train)

print("Best Hyperparameters:", grid_search.best_params_)

y_pred = grid_search.predict(X_normalized_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Best Hyperparameters: {'alpha': 5.0}
Accuracy: 0.7441860465116279
                                 precision    recall  f1-score   support

            Dariusz DM Matusiak       0.82      0.93      0.87        15
Michał Czarny Wilk Grygorcewicz       0.70      0.54      0.61        13
             Przemysław Zamęcki       0.69      0.73      0.71        15

                       accuracy                           0.74        43
                      macro avg       0.74      0.74      0.73        43
                   weighted avg       0.74      0.74      0.74        43



In [None]:
import warnings
from sklearn.utils.validation import DataConversionWarning

# Suppress the warning
warnings.filterwarnings("ignore", category=DataConversionWarning)

In [None]:
pipeline = Pipeline([
    ('clf', AdaBoostClassifier(RandomForestClassifier()))
])

param_grid = {
    'clf__n_estimators': [800, 1200],
    'clf__estimator__min_samples_leaf': [16, 32],
    'clf__learning_rate': [0.001, 0.01],
}

grid_search = GridSearchCV(pipeline, param_grid, cv=10)
grid_search.fit(X_train_num_features, y_train)

print("Best Hyperparameters:", grid_search.best_params_)

y_pred = grid_search.predict(X_test_num_features)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Persist the best estimator found by the grid search. The context manager
# guarantees the file is flushed and closed even if pickling fails.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(grid_search.best_estimator_, model_file)

# First model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
df_top3['author'].value_counts()

Przemysław Zamęcki                 158
Dariusz DM Matusiak                 91
Michał Czarny Wilk Grygorcewicz     82
Name: author, dtype: int64

In [None]:
X = df_top3['review']
y = df_top3['author']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:
tfidf = TfidfVectorizer(use_idf=False)
tfidf.fit(X_train)

In [None]:
X_train_tf = tfidf.transform(X_train)
X_test_tf = tfidf.transform(X_test)

In [None]:
tfidf.get_feature_names_out()

array(['_the', 'ale', 'bardziej', 'bardzo', 'bazie', 'bez', 'bo', 'by',
       'być', 'było', 'choć', 'co', 'czas', 'czasu', 'czy', 'części',
       'dla', 'do', 'dobrze', 'dość', 'gdy', 'gier', 'gra', 'gry', 'grze',
       'grę', 'ich', 'jak', 'jako', 'jednak', 'jego', 'jej', 'jest',
       'jeszcze', 'jeśli', 'już', 'kiedy', 'kilka', 'która', 'które',
       'której', 'który', 'których', 'lat', 'lub', 'ma', 'mi', 'może',
       'można', 'na', 'nam', 'nas', 'nawet', 'nie', 'nieco', 'niż', 'od',
       'of', 'oraz', 'po', 'pod', 'postaci', 'powstała', 'przed', 'przez',
       'przy', 'recenzja', 'roku', 'rozgrywki', 'również', 'serii', 'się',
       'sobie', 'sposób', 'są', 'tak', 'także', 'tego', 'tej', 'ten',
       'też', 'the', 'to', 'trochę', 'tu', 'twórcy', 'tych', 'tylko',
       'tym', 'wersji', 'wiele', 'więc', 'więcej', 'wszystko', 'właśnie',
       'za', 'zabawy', 'ze', 'zupełnie', 'że'], dtype=object)

In [None]:
print(X_train_tf.todense())

[[0.         0.09346771 0.0437479  ... 0.         0.         0.09000369]
 [0.14797406 0.06676108 0.04687164 ... 0.         0.02866521 0.22500393]
 [0.         0.07131365 0.05006791 ... 0.         0.         0.03433534]
 ...
 [0.17351237 0.1174247  0.         ... 0.10341003 0.         0.1130728 ]
 [0.         0.08426649 0.03944123 ... 0.03710463 0.         0.18933477]
 [0.         0.1228397  0.         ... 0.04056703 0.         0.14785889]]


In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', AdaBoostClassifier(RandomForestClassifier()))
])

param_grid = {
    'tfidf__max_df': [0.5, 1.0],
    'clf__n_estimators': [200, 400],
    'clf__estimator__max_depth': [5, None],
    'clf__estimator__min_samples_leaf': [2, 4, 6],
    'clf__learning_rate': [0.01, 0.1, 1.0],
}

grid_search = GridSearchCV(pipeline, param_grid, cv=10)
grid_search.fit(X_train, y_train)

print("Best Hyperparameters:", grid_search.best_params_)

y_pred = grid_search.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
nb = MultinomialNB()
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 5.0, 10.0]}

grid_search =  GridSearchCV(nb, param_grid, cv=10)
grid_search.fit(X_train_tf, y_train)

print("Best Hyperparameters:", grid_search.best_params_)

y_pred = grid_search.predict(X_test_tf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Best Hyperparameters: {'alpha': 0.1}
Accuracy: 0.7164179104477612
                                 precision    recall  f1-score   support

            Dariusz DM Matusiak       1.00      0.67      0.80        18
Michał Czarny Wilk Grygorcewicz       1.00      0.24      0.38        17
             Przemysław Zamęcki       0.63      1.00      0.77        32

                       accuracy                           0.72        67
                      macro avg       0.88      0.63      0.65        67
                   weighted avg       0.82      0.72      0.68        67



# Second model

In [None]:
!pip install -U pip setuptools wheel
!pip install -U spacy
!python3 -m spacy download pl_core_news_lg

In [None]:
import spacy
import pl_core_news_lg

In [None]:
nlp = spacy.load('pl_core_news_lg')

In [None]:
def lemmatize(sentence):
    """Lemmatize `sentence` with spaCy and return the lemmas joined by single spaces."""
    lemmas = []
    for token in nlp(sentence):
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)

In [None]:
df_top3['lemma'] = df_top3['review'].map(lemmatize)

In [None]:
print(df_top3['lemma'].to_list()[5])

The Callisto Protocol zapewne móc by być kolejny rozdział seria Dead Space . zamiast to być on godny duchowy spadkobierca – jeszcze bardzo krwawy , brutalny i mroczny . to jeden z bardzo klimatyczny giera ostatni rok .   recenzja powstać na baza wersja .   „ Pamiętajcie , by robić siebie przerwa , odejść od monitor i złapać oddech ” – tak brzmieć jeden z rada twórca _ The Callisto Protocol _ dla gracz . ale nawet bez taki wskazówka podświadomie odbywać być niezbyt długi sesja z ten gra . i nie dlatego , że być nużący czy nie wciągać , tylko przez niezwykle przytłaczający , klaustrofobiczny klimat survival horror , potęgować jeszcze arcybrutalny widok typ gore . serio , _ Sniper Elite _ z swój rentgenowski killcam wydawać się gra familijny - edukacyjny przy scena śmierć główny bohater w _ The Callisto Protocol _ .   jeśli tylko zadbać się o odpowiedni warunki , czyli grać w ciemny pokój z słuchawki na ucho , można liczyć na atmosfera , który dać się kroić nóż , na ciągły napięcie bez ab

In [None]:
# 'lemma_no_stop_words' is not created by any cell above (only 'lemma' is),
# so derive it here when it is absent to avoid a KeyError.
# NOTE(review): assumes the column is the lemmatized text with stop words
# removed -- confirm against the cell that originally produced it.
if 'lemma_no_stop_words' not in df_top3.columns:
    df_top3['lemma_no_stop_words'] = df_top3['lemma'].map(remove_stop_words)

X = df_top3['lemma_no_stop_words']
y = df_top3['author']

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:
tfidf = TfidfVectorizer()
tfidf.fit(X_train)

In [None]:
X_train_tf = tfidf.transform(X_train)
X_test_tf = tfidf.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', AdaBoostClassifier(RandomForestClassifier()))
])

param_grid = {
    'tfidf__max_df': [0.5, 1.0],
    'clf__n_estimators': [200, 400],
    'clf__estimator__max_depth': [5, None],
    'clf__estimator__min_samples_leaf': [4, 6],
    'clf__learning_rate': [0.1, 1.0],
}

grid_search = GridSearchCV(pipeline, param_grid, cv=10)
grid_search.fit(X_train, y_train)

print("Best Hyperparameters:", grid_search.best_params_)

y_pred = grid_search.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
nb = MultinomialNB()
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 5.0, 10.0]}

grid_search =  GridSearchCV(nb, param_grid, cv=10)
grid_search.fit(X_train_tf, y_train)

print("Best Hyperparameters:", grid_search.best_params_)

y_pred = grid_search.predict(X_test_tf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Best Hyperparameters: {'alpha': 0.1}
Accuracy: 0.8656716417910447
                                 precision    recall  f1-score   support

            Dariusz DM Matusiak       1.00      0.78      0.88        18
Michał Czarny Wilk Grygorcewicz       1.00      0.71      0.83        17
             Przemysław Zamęcki       0.78      1.00      0.88        32

                       accuracy                           0.87        67
                      macro avg       0.93      0.83      0.86        67
                   weighted avg       0.90      0.87      0.86        67



In [None]:
for review in df_top3['review'].to_list():
  print(len(review))

# Article to Matrix

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Sample DataFrame with articles
df = pd.DataFrame({'article': ['This is the first sentence. This is the second sentence.',
                               'Another article with multiple sentences. Sentence three.']})



In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Assuming you have a DataFrame called 'df' with 'author' and 'article' columns

# Splitting the article into sentences
sentences = []
for article in df['article']:
    # Assuming the sentences are separated by periods ('.')
    sentences.extend(article.split('.'))
sentences = [s.strip() for s in sentences if s.strip()]

# Creating a CountVectorizer instance
vectorizer = CountVectorizer()

# Fitting and transforming the sentences to obtain the term frequency matrix
tf_matrix = vectorizer.fit_transform(sentences)

# Creating a DataFrame from the tf_matrix
matrix_df = pd.DataFrame(tf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Printing the resulting matrix DataFrame
print(matrix_df)

   another  article  first  is  multiple  second  sentence  sentences  the  \
0        0        0      1   1         0       0         1          0    1   
1        0        0      0   1         0       1         1          0    1   
2        1        1      0   0         1       0         0          1    0   
3        0        0      0   0         0       0         1          0    0   

   this  three  with  
0     1      0     0  
1     1      0     0  
2     0      0     1  
3     0      1     0  


In [None]:
unique_words = set()

# Collect the distinct lemmas used across all reviews.
for article in df_top3['review']:
    # Run the spaCy pipeline on the review and keep the lemma of every token
    # that is not a stop word, punctuation or whitespace.
    unique_words.update(
        token.lemma_
        for token in nlp(article)
        if not token.is_stop and not token.is_punct and not token.is_space
    )

# Sets have no .sort() method (calling it raises AttributeError), so print
# the set as-is; the following cell prints the sorted listing.
print(unique_words)

{'obserwować być', 'kolonizator', 'Debil', 'kasa', 'wypunktować', 'gorszy', 'Aloy', 'ratuć', 'narastać', 'fuszerka', 'pozostały', 'wypierać', 'obserwacyjny', 'wyczerpuć', 'Zapożyczając', '/galeria', 'wraże', 'wbić', 'niedźwiedź', 'węgieł', 'schyłek', 'chcący', 'zapuszczyć', 'opus', 'wymachujących', 'Nomad', 'szermierka', 'stworzyć', 'jesteście', 'podobać', 'pokolen', 'intrygoweć', 'popołudnie', 'Arcadia', 'remake', 'targets', 'Blind', 'nawiązać', 'tyczka', 'raytracingowy', 'nieznany', 'kucać', 'Jerycho', 'wyciągać', 'przykrywka', 'znalezione', 'Magik', 'remasterować', 'kompilacja', 'Sapienza', 'uszyć', 'Kotak', 'Watchmen', 'lunaparek', 'szybko', 'zwierzęcie', 'ankieta', 'niezmiernie', 'łącze', 'Cage', 'kompleksowość', 'manewrowy', 'zadowalający', 'mamić', 'najładniejszego', 'docenić być', 'autorski', 'komplikować', 'głupa', 'Normandia', 'chór', 'zgodny', 'męczyć być', 'Numener', 'ominąć', 'zima', 'pływać', 'Shockowy', 'Inny', 'zasłonić', 'zakładać', 'nieposłuszny', 'pobudka', 'podbramk

In [None]:
print(sorted(unique_words))

['+', '/galeria', '0', '000', '01', '02', '04', '1', '1,3', '1/3', '1/4', '10', '10/10', '100', '1000', '100cc', '101', '102', '105', '10600', '1080p', '11', '117', '12', '13', '130', '14', '1403', '141', '144', '1440p', '15', '150', '16', '160', '1606', '16:9', '17', '18', '180', '186', '19', '1914', '1918', '1920x1080', '1943', '1944', '1945', '1956', '1958', '1964', '1975', '1976', '1977', '1979', '1980', '1982', '1983', '1984', '1986', '1987', '1988', '1989', '1992', '1993', '1994', '1995', '1996', '1998', '1999', '1:1', '1\\', '2', '2,5D', '2,9', '20', '200', '2000', '2001', '2002', '2004', '2005', '2006', '2008', '2009', '200cc', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '202', '2020', '2022', '2023', '2038', '2044', '2077', '2084', '21', '21:9', '22', '221', '221b', '22474487139', '24', '25', '256', '2560x1080', '26', '27', '270', '28', '2D', '2\\', '3', '3,5', '30', '300', '3060', '32', '320', '34', '35', '356', '358/2', '360', '386', '3D',