In [None]:
import pandas as pd
import numpy as np

## Wczytanie słownika i wyfiltrowanie interesujących danych

In [None]:
# Load the emotional-annotation dictionary (CSV) into a DataFrame.
# NOTE(review): the file name contains "emocjonlanej", presumably a typo for
# "emocjonalnej" - confirm against the file on disk before changing the path.
df = pd.read_csv('./plwordnet_4_2/słownik_anotacji_emocjonlanej.csv')

In [None]:
# Distinct sentiment-strength labels, in order of first appearance.
# Missing values are dropped with dropna() rather than removed from the list
# afterwards: removing np.nan from a list only succeeds when the list holds the
# very same NaN object, and raises ValueError when the column has no NaN at all.
nacechowania = df['stopien_nacechowania'].dropna().unique().tolist()
nacechowania

In [None]:
# Keep only the dictionary rows whose label is one of the known labels
# (this drops the rows with a missing label).
has_known_label = df['stopien_nacechowania'].isin(nacechowania)
df = df.loc[has_known_label]
df

In [None]:
# Reduce the dictionary to (lemma, label) pairs and collapse repeated pairs.
word_sentiment_df = df.loc[:, ['lemat', 'stopien_nacechowania']].drop_duplicates()
word_sentiment_df

In [None]:
# Lookup table: lemma -> sentiment-strength label.
# When a lemma appears with more than one label, the last row wins.
word_sentiment_dict = dict(
    zip(word_sentiment_df['lemat'], word_sentiment_df['stopien_nacechowania'])
)

## Wczytanie tweetów do analizy

In [None]:
# pomocnicze funkcje

def flatten_dct_(prefix, dct):
    """Flatten a nested dict into a single-level dict with dot-joined keys.

    Args:
        prefix: String prepended to every key produced for ``dct``.
        dct: Dict whose values may themselves be dicts (nested to any depth).

    Returns:
        A new dict in which every non-dict leaf is stored under
        ``prefix + "outer.inner..."``. Empty nested dicts contribute no keys.
    """
    flat = {}
    for name, item in dct.items():
        if isinstance(item, dict):
            # Flatten the sub-dict relative to its own key, then re-root it here.
            nested = flatten_dct_(name + ".", item)
            flat.update({prefix + path: leaf for path, leaf in nested.items()})
        else:
            flat[prefix + name] = item
    return flat

def flatten_dct(dct):
    """Flatten a nested dict into one level, joining nested keys with dots.

    Convenience entry point that starts the flattening with an empty prefix.
    """
    root_prefix = ''
    return flatten_dct_(root_prefix, dct)



#### Wczytanie tweetów do pd.DataFrame

In [None]:
import pymongo

# MongoDB connection settings.
DB_CONN_STRING = "mongodb://localhost:27017/"
DB_NAME = "recent-poland-covid-vaccine-tweets"
COLLECTION_NAME = "tweets"

# Open the client and resolve the tweet collection. `data_collection` is used
# again at the end of the notebook to write the computed sentiment back.
mongo_client = pymongo.MongoClient(DB_CONN_STRING)
database_name = mongo_client[DB_NAME]
data_collection = database_name[COLLECTION_NAME]


# Select only the tweets that do not have a `sentiment` field yet.
query = {'sentiment':{ '$exists': False}}
# Fields to fetch for each tweet.
# NOTE(review): `_id` is not listed here but is read from the frame when saving;
# presumably MongoDB returns it by default - confirm.
projection = {"date":1, "content":1, "user.username":1, "user.id":1, "user.followersCount":1, "user.verified":1}
# Flatten each document so nested fields become dot-named keys (e.g. "user.id").
tweets = [flatten_dct(tweet) for tweet in data_collection.find(query, projection)]
# print(tweets[0])

# NOTE: this rebinds `df`, which until this cell held the sentiment dictionary.
df = pd.DataFrame(tweets)
df

#### Wyczyszczenie tweetów z hashtagów, cytowań, linków... 

In [None]:
import re

def clean_up_tweet(tweet):
    """Strip hashtags, mentions, cashtags and links from a tweet and normalise it.

    The text is split on whitespace and every token that starts with ``$``,
    ``#``, ``@`` or ``http`` is discarded. The remaining tokens are joined with
    single spaces, every character that is neither a word character nor a space
    is removed, and the result is lower-cased.

    Args:
        tweet: Raw tweet text (``str``).

    Returns:
        The cleaned, lower-cased text (``str``); empty when nothing is left.
    """
    # Filtering token by token (rather than one regex pass over a space-padded
    # string) removes *adjacent* tags such as "#a #b", treats newlines and tabs
    # as word separators, and never trims a real character off the end.
    kept = [
        token for token in tweet.split()
        if not token.startswith(('$', '#', '@', 'http'))
    ]
    return re.sub(r'[^\w ]', '', ' '.join(kept)).lower()

# Clean every tweet's text and keep the result in a separate column.
df['content_cleaned'] = df['content'].map(clean_up_tweet)

#### Sprowadzenie słów do podstawowej formy gramatycznej

In [None]:
from stempel import StempelStemmer
# Stemmer instance shared by the cells below.
# NOTE(review): presumably loads the Polish stemming table bundled with the
# `stempel` package - confirm.
stemmer = StempelStemmer.default()

In [None]:
def to_base_grammar_form(tweet):
    """Replace every word of a space-separated text with its stem.

    Args:
        tweet: Text whose words are separated by single spaces; empty tokens
            produced by repeated spaces are skipped.

    Returns:
        The stemmed words joined by single spaces. A word is kept unchanged
        when stemming it ends in a ``TypeError``.
    """
    pieces = []
    for token in tweet.split(' '):
        if not token:
            continue
        try:
            piece = ' ' + stemmer.stem(token)
        except TypeError:
            # Raised e.g. when the stemmer result cannot be concatenated to a
            # str; fall back to the original word.
            piece = ' ' + token
        pieces.append(piece)
    # Every piece carries a leading space; drop the very first one.
    return ''.join(pieces)[1:]

# Stem the cleaned text in place (overwrites the `content_cleaned` column).
df['content_cleaned'] = df['content_cleaned'].map(to_base_grammar_form)

## Analiza sentymentu

In [None]:
def get_sentiment(tweet):
    """Sum the sentiment weights of the words in a space-separated text.

    Each word is looked up in the module-level ``word_sentiment_dict`` to get
    its label, and the label is translated to a numeric weight. Words missing
    from the dictionary, and labels without a weight, contribute nothing.

    Args:
        tweet: Space-separated (already cleaned and stemmed) text.

    Returns:
        The total weight; ``0`` when no word carries a known label.
    """
    label_weights = {
        '- s' : -0.5,
        '- s ': -0.5,  # same label with a trailing space
        '- m' : -1,
        '+ s' : 0.5,
        '+ m' : 1,
        'amb' : 0
    }
    total = 0
    for token in tweet.split(' '):
        weight = label_weights.get(word_sentiment_dict.get(token))
        if weight is not None:
            total += weight
    return total

In [None]:
# Score every cleaned tweet and keep the result in the `sentiment` column.
df['sentiment'] = df['content_cleaned'].map(get_sentiment)

In [None]:
# tw_idx = 370038
# print(df['content'][tw_idx] + "  --> " + str(get_sentiment(df['content_cleaned'][tw_idx])))

## Zapisanie sentymentu w bazie danych

In [None]:
# Write each tweet's score back to its MongoDB document, matched by `_id`.
# The score is stored as its string representation.
for tweet_id, score in zip(df['_id'], df['sentiment']):
    data_collection.find_one_and_update(
        { "_id": int(tweet_id) },
        { "$set": { "sentiment": "{}".format(score) } },
    )