In [None]:
# Execute the project's helper notebooks before anything else in this notebook runs.
# Helpers called further down (get_df, clean_users, get_df_from_feather, get_in_between,
# add_extended_features) are not defined in this notebook, so they presumably come from
# these modules -- TODO confirm which module provides which helper.
%run ./modules/load_data_module.ipynb
%run ./modules/extract_info_clean_data_module.ipynb
%run ./modules/update_dataframe_module.ipynb
%run ./modules/sentiment_module.ipynb
%run ./modules/semantic_similarity_module.ipynb

In [None]:
import pandas as pd
import spacy
import string
import regex as re
import nltk
import ast
import copy
import glob
import advertools as adv
import plotly.graph_objects as go
import time
from datetime import datetime

# Show up to 285 characters per cell when DataFrames are rendered.
pd.set_option("display.max_colwidth", 285)

from textblob import TextBlob

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from spacymoji import Emoji
from nltk.corpus import stopwords
from urllib.parse import urlparse
from textblob import Word
from sklearn.metrics.pairwise import cosine_similarity


# Load spaCy's small English pipeline and register the spacymoji component at the
# front of the pipeline.
nlp = spacy.load("en_core_web_sm")
# NOTE(review): this Emoji instance is never passed to add_pipe below, which is given
# the string name "emoji" instead; the instance looks redundant unless other code
# reads the `emoji` global -- confirm before removing.
emoji = Emoji(nlp)
nlp.add_pipe("emoji", first=True)


# English Snowball stemmer from NLTK; not used in this notebook's visible cells.
stemmer = nltk.SnowballStemmer("english")


In [None]:
# Column -> pandas dtype mapping passed to get_df() when loading the raw users CSVs.
# Nullable extension dtypes ("Int64", "boolean", "string") are used throughout.
user_dtype = dict(
    id="Int64",
    username="string",
    created_at="string",
    name="string",
    protected="boolean",
    verified="boolean",
    followers_count="Int64",
    following_count="Int64",
    tweet_count="Int64",
    listed_count="Int64",
    url="string",
    description="string",
)


# Column -> pandas dtype mapping for tweet CSVs (passed to get_df() further down).
tweet_dtype = dict(
    id="Int64",
    author_id="Int64",
    conversation_id="Int64",
    created_at="string",
    lang="string",
    retweet_count="Int64",
    reply_count="Int64",
    like_count="Int64",
    quote_count="Int64",
    reply_settings="string",
    text="string",
)

# Column -> pandas dtype mapping for the scored/annotated tweet samples.
# NOTE(review): 'uppercase_pct' is typed as an integer although the name suggests a
# percentage -- confirm the stored values really are whole numbers.
farnaz_dtype = dict(
    # identifiers and raw tweet fields
    id="Int64",
    author_id="Int64",
    conversation_id="Int64",
    created_at="string",
    lang="string",
    retweet_count="Int64",
    reply_count="Int64",
    like_count="Int64",
    quote_count="Int64",
    reply_settings="string",
    text="string",
    # surface features of the text
    hashtags="string",
    hashtag_count="Int64",
    uppercase_count="Int64",
    uppercase_pct="Int64",
    exclamation_mark_count="Int64",
    question_mark_count="Int64",
    url_count="Int64",
    mention_count="Int64",
    emojie_count="Int64",
    # author / engagement
    followers_count="float64",
    engagement_score="float64",
    # sentiment features
    sentiment_score="float64",
    sentiment_category="string",
    avg_replies_sentiment="float64",
    avg_quotes_sentiment="float64",
    avg_conversation_sentiment="float64",
    var_replies_sentiment="float64",
    var_quotes_sentiment="float64",
    var_conversation_sentiment="float64",
    pct_similar_sentiment_replies="float64",
    pct_similar_sentiment_quotes="float64",
    # semantic features
    clean_text="string",
    var_replies_semantic="float64",
    var_quotes_semantic="float64",
    var_conversation_semantic="float64",
    # score column
    GS_controversiality_score="float64",
)


# Same schema as farnaz_dtype with a leading 'index' column. Built as an independent
# copy, so changing one mapping later does not affect the other.
niloo_corran_dtype = {"index": "Int64", **farnaz_dtype}

# Column -> pandas dtype mapping for reply/quote ("conversation") CSVs.
conversation_dtype = dict(
    id="Int64",
    author_id="Int64",
    conversation_id="Int64",
    created_at="string",
    lang="string",
    retweet_count="Int64",
    reply_count="Int64",
    like_count="Int64",
    quote_count="Int64",
    referenced_tweets="string",
    text="string",
)

# Same schema plus a trailing 'reference_tweet_id' column; an independent copy.
conversation_dtype_ref_tweet = {**conversation_dtype, "reference_tweet_id": "Int64"}


In [None]:
# Names of the extra per-conversation feature columns, in their original order.
# The entries look like LIWC-22 output column names -- TODO confirm the source.
# NOTE(review): the constant's name is misspelled ("EXTEXDED"); it is kept as-is because
# code outside this cell may refer to it under this exact name.
EXTEXDED_COLS = [
    "conversation_id", "Segment",
    "WC", "Analytic", "Clout", "Authentic", "Tone", "WPS", "BigWords", "Dic",
    "Linguistic", "function", "pronoun", "ppron", "i", "we", "you", "shehe", "they", "ipron",
    "det", "article", "number", "prep", "auxverb", "adverb", "conj", "negate", "verb", "adj", "quantity",
    "Drives", "affiliation", "achieve", "power",
    "Cognition", "allnone", "cogproc", "insight", "cause", "discrep", "tentat", "certitude", "differ", "memory",
    "Affect", "tone_pos", "tone_neg", "emotion", "emo_pos", "emo_neg", "emo_anx", "emo_anger", "emo_sad", "swear",
    "Social", "socbehav", "prosocial", "polite", "conflict", "moral", "comm", "socrefs",
    "family", "friend", "female", "male",
    "Culture", "politic", "ethnicity", "tech",
    "Lifestyle", "leisure", "home", "work", "money", "relig",
    "Physical", "health", "illness", "wellness", "mental", "substances", "sexual", "food", "death",
    "need", "want", "acquire", "lack", "fulfill", "fatigue",
    "reward", "risk", "curiosity", "allure",
    "Perception", "attention", "motion", "space", "visual", "auditory", "feeling",
    "time", "focuspast", "focuspresent", "focusfuture",
    "Conversation", "netspeak", "assent", "nonflu", "filler",
    "AllPunc", "Period", "Comma", "QMark", "Exclam", "Apostro", "OtherP",
]

In [None]:
# Pre-computed extra-feature table for the 15 July - 5 Aug tweets.
# Read without an explicit dtype mapping, so pandas infers the column types.
# Not referenced again in this notebook; presumably read as a global by a helper
# (e.g. add_extended_features) -- TODO confirm.
EXTENDED_TWEETS = pd.read_csv('../data/processed-data/tweets_15July_5Aug/all_tweets_extra_features.csv')

# initial dataframe cleaning

In [None]:
# Only the users files are loaded from the raw data; the tweet/reply/quote globs are
# kept commented out for reference.
# raw_tweets_path = glob.glob('../data/data/*tweets.csv')
# glob.glob() returns matches in an arbitrary, filesystem-dependent order. Sorting makes
# the list of paths handed to get_df() identical on every machine and every run.
raw_users_path = sorted(glob.glob('../data/data/*users.csv'))
# raw_replies_path = glob.glob('../data/data/*replies.csv')
# raw_quotes_path = glob.glob('../data/data/*quotes.csv')

In [None]:
# raw_all_tweets = get_df(raw_tweets_path, tweet_dtype)
# raw_all_quotes = get_df(raw_quotes_path, conversation_dtype).dropna()
# raw_all_replies = get_df(raw_replies_path, conversation_dtype)

In [None]:
# Load every raw users CSV matched above using the user_dtype column mapping, then
# pass the result through clean_users(). Both helpers are defined outside this
# notebook; what exactly clean_users() removes or normalises is not visible here.
raw_all_users = get_df(raw_users_path, user_dtype)
all_users = clean_users(raw_all_users)

In [None]:
# raw_all_replies['reference_tweet_id'] = add_reference_id(raw_all_replies)
# raw_all_quotes['reference_tweet_id'] = add_reference_id(raw_all_quotes)

### Load all processed data

In [None]:
# Load the processed tweets, replies and quotes from their feather shards.
# glob.glob() yields files in an arbitrary, filesystem-dependent order, so each list is
# sorted to hand the shards to get_df_from_feather() in a reproducible order.
# Note that tweets come from the "all_tweets_v2" directory, while replies and quotes
# come from unversioned directories.
processed_tweets = get_df_from_feather(sorted(glob.glob('../data/processed-data/all_tweets_v2/*.feather')))
processed_replies = get_df_from_feather(sorted(glob.glob('../data/processed-data/all_replies/*.feather')))
processed_quotes = get_df_from_feather(sorted(glob.glob('../data/processed-data/all_quotes/*.feather')))

In [None]:
# Spot-check one conversation: show its semantic/sentiment variance columns for the
# whole conversation, the replies, and the quotes (one table per pair).
_conv_rows = processed_tweets.loc[processed_tweets['conversation_id'] == 1553799587541942278]
for _metric_pair in (
    ['var_conversation_semantic', 'var_conversation_sentiment'],
    ['var_replies_semantic', 'var_replies_sentiment'],
    ['var_quotes_semantic', 'var_quotes_sentiment'],
):
    display(_conv_rows[_metric_pair])

### Processed data: 15 July – 5 Aug

In [None]:
# Same load as for the full data set, restricted to the 15 July - 5 Aug directories.
# glob.glob() yields files in an arbitrary, filesystem-dependent order, so each list is
# sorted to hand the shards to get_df_from_feather() in a reproducible order.
# Tweets come from a "_v2" directory; replies and quotes from unversioned ones.
processed_tweets15 = get_df_from_feather(sorted(glob.glob('../data/processed-data/tweets_15July_5Aug_v2/*.feather')))
processed_replies15 = get_df_from_feather(sorted(glob.glob('../data/processed-data/replies_15July_5Aug/*.feather')))
processed_quotes15 = get_df_from_feather(sorted(glob.glob('../data/processed-data/quotes_15July_5Aug/*.feather')))

In [None]:
# Spot-check the same conversation in the 15 July - 5 Aug subset: one table per
# (semantic variance, sentiment variance) column pair.
_conv_rows = processed_tweets15.loc[processed_tweets15['conversation_id'] == 1553799587541942278]
for _metric_pair in (
    ['var_conversation_semantic', 'var_conversation_sentiment'],
    ['var_replies_semantic', 'var_replies_sentiment'],
    ['var_quotes_semantic', 'var_quotes_sentiment'],
):
    display(_conv_rows[_metric_pair])

## Load raw sample data

In [None]:
# Raw sample frames (tweets, replies, quotes), each read from a single feather file
# in the raw-samples directory.
raw_sample_tweets = pd.read_feather('../data/sample_data/raw-samples/sample_tweets.feather')
raw_sample_replies = pd.read_feather('../data/sample_data/raw-samples/sample_replies.feather')
raw_sample_quotes = pd.read_feather('../data/sample_data/raw-samples/sample_quotes.feather')

### Load sample_data unscored

In [None]:
# Unscored sample frames. The tweets file is read from the "sample_data_unscored_v2"
# directory, whereas replies and quotes are read from "sample_data_unscored".
unscored_sample_tweets = pd.read_feather('../data/sample_data/sample_data_unscored_v2/sample_tweets.feather')
unscored_sample_replies = pd.read_feather('../data/sample_data/sample_data_unscored/sample_replies.feather')
unscored_sample_quotes = pd.read_feather('../data/sample_data/sample_data_unscored/sample_quotes.feather')

In [None]:
# Spot-check one conversation in the unscored sample: one table per
# (semantic variance, sentiment variance) column pair.
_conv_rows = unscored_sample_tweets.loc[unscored_sample_tweets['conversation_id'] == 1553799587541942278]
for _metric_pair in (
    ['var_conversation_semantic', 'var_conversation_sentiment'],
    ['var_replies_semantic', 'var_replies_sentiment'],
    ['var_quotes_semantic', 'var_quotes_sentiment'],
):
    display(_conv_rows[_metric_pair])

## Load sample data 15 July - 5 Aug.

In [None]:
# Unscored sample frames for 15 July - 5 Aug. The tweets file lives in the "_v2"
# directory and is named sample_tweets.feather; the replies and quotes files live in
# the unversioned directory and carry a "_15July_5Aug" suffix in their file names.
unscored_sample_tweets_3fw = pd.read_feather('../data/sample_data/sample_data_unscored_15July_5Aug_v2/sample_tweets.feather')
unscored_sample_replies_3fw = pd.read_feather('../data/sample_data/sample_data_unscored_15July_5Aug/sample_replies_15July_5Aug.feather')
unscored_sample_quotes_3fw= pd.read_feather('../data/sample_data/sample_data_unscored_15July_5Aug/sample_quotes_15July_5Aug.feather')

In [None]:
# Spot-check one conversation in the 15 July - 5 Aug unscored sample: one table per
# (semantic variance, sentiment variance) column pair.
_conv_rows = unscored_sample_tweets_3fw.loc[unscored_sample_tweets_3fw['conversation_id'] == 1553799587541942278]
for _metric_pair in (
    ['var_conversation_semantic', 'var_conversation_sentiment'],
    ['var_replies_semantic', 'var_replies_sentiment'],
    ['var_quotes_semantic', 'var_quotes_sentiment'],
):
    display(_conv_rows[_metric_pair])

### Farnaz

In [None]:
# Load the "farnaz" tweet sample. The path is a plain string literal: the original
# f-prefix had no placeholders to interpolate, so it has been dropped.
sample_tweets_farnaz = pd.read_feather('../data/sample_data/sample_tweets_farnaz.feather')

In [None]:
# Spot-check one conversation in the farnaz sample: one table per
# (semantic variance, sentiment variance) column pair.
_conv_rows = sample_tweets_farnaz.loc[sample_tweets_farnaz['conversation_id'] == 1549869265901199360]
for _metric_pair in (
    ['var_conversation_semantic', 'var_conversation_sentiment'],
    ['var_replies_semantic', 'var_replies_sentiment'],
    ['var_quotes_semantic', 'var_quotes_sentiment'],
):
    display(_conv_rows[_metric_pair])

### Farnaz: first 3 weeks

In [None]:
# Restrict the farnaz sample to the window between 15 July 2022 and 6 Aug 2022.
# NOTE(review): get_in_between() is defined outside this notebook, so whether the
# bounds are inclusive is not visible here; 6 Aug is presumably used as the end bound
# so that all of 5 Aug is included -- confirm.
sample_tweets_farnaz_f3w = get_in_between(sample_tweets_farnaz, datetime(2022, 7, 15), datetime(2022, 8, 6))

In [None]:
# Spot-check one conversation in the first-three-weeks farnaz sample: one table per
# (semantic variance, sentiment variance) column pair.
_conv_rows = sample_tweets_farnaz_f3w.loc[sample_tweets_farnaz_f3w['conversation_id'] == 1549869265901199360]
for _metric_pair in (
    ['var_conversation_semantic', 'var_conversation_sentiment'],
    ['var_replies_semantic', 'var_replies_sentiment'],
    ['var_quotes_semantic', 'var_quotes_sentiment'],
):
    display(_conv_rows[_metric_pair])

### Niloo

In [None]:
# Load the "niloo" first-three-weeks tweet sample. The path is a plain string literal:
# the original f-prefix had no placeholders to interpolate, so it has been dropped.
sample_tweets_niloo_f3w = pd.read_feather('../data/sample_data/sample_tweets_niloo_first3weeks.feather')

In [None]:
# Spot-check one conversation in the first-three-weeks niloo sample: one table per
# (semantic variance, sentiment variance) column pair.
_conv_rows = sample_tweets_niloo_f3w.loc[sample_tweets_niloo_f3w['conversation_id'] == 1549869265901199360]
for _metric_pair in (
    ['var_conversation_semantic', 'var_conversation_sentiment'],
    ['var_replies_semantic', 'var_replies_sentiment'],
    ['var_quotes_semantic', 'var_quotes_sentiment'],
):
    display(_conv_rows[_metric_pair])

In [None]:
# Load the full "niloo" tweet sample. The path is a plain string literal: the original
# f-prefix had no placeholders to interpolate, so it has been dropped.
sample_tweets_niloo = pd.read_feather('../data/sample_data/sample_tweets_niloo.feather')

In [None]:
# Spot-check one conversation in the full niloo sample: one table per
# (semantic variance, sentiment variance) column pair.
_conv_rows = sample_tweets_niloo.loc[sample_tweets_niloo['conversation_id'] == 1549869265901199360]
for _metric_pair in (
    ['var_conversation_semantic', 'var_conversation_sentiment'],
    ['var_replies_semantic', 'var_replies_sentiment'],
    ['var_quotes_semantic', 'var_quotes_sentiment'],
):
    display(_conv_rows[_metric_pair])

### Corran

In [None]:
# Load the first-500 scored sample. The commented-out line is an alternative load via
# get_df() with the niloo_corran_dtype mapping; the active line lets pandas infer the
# column types instead.
# dropna() is called without arguments, so every row containing at least one missing
# value is discarded.
# sample_corran_500 = get_df(['../data/sample_data/first_500_sample_scored.csv'], niloo_corran_dtype)
sample_corran_500 = pd.read_csv('../data/sample_data/first_500_sample_scored.csv').dropna()

In [None]:
# Stack the two 500-row scored samples into one frame (original row indices are kept,
# since ignore_index is not passed).
# NOTE(review): `sample_niloo_500` is not assigned in any cell of this notebook; unless
# one of the %run modules defines it, this cell raises NameError on a fresh kernel
# (Restart & Run All) -- confirm where it is supposed to come from.
sample_corran_niloo = pd.concat([sample_niloo_500, sample_corran_500])

## Add extra features to sample

In [None]:
# Read the farnaz sample CSV with the tweet_dtype mapping and pass it through
# add_extended_features() (defined outside this notebook).
# NOTE(review): the file name here is "sample_retweets_farnaz.csv", whereas the feather
# file loaded earlier is "sample_tweets_farnaz" -- confirm that "retweets" is intended.
sample_tweets_farnaz_extended_features = add_extended_features(get_df(['../data/sample_data/sample_retweets_farnaz.csv'], tweet_dtype))

In [None]:
# Read the niloo first-three-weeks sample CSV with the tweet_dtype mapping and pass it
# through add_extended_features() (defined outside this notebook).
sample_tweets_niloo_first3weeks_extended_features = add_extended_features(get_df(['../data/sample_data/sample_tweets_niloo_first3weeks.csv'], tweet_dtype))

In [None]:
# Read the full niloo sample CSV with the tweet_dtype mapping and pass it through
# add_extended_features() (defined outside this notebook).
sample_tweets_niloo_extended_features = add_extended_features(get_df(['../data/sample_data/sample_tweets_niloo.csv'], tweet_dtype))

In [None]:
# Read the first-500 scored sample CSV with the tweet_dtype mapping and pass it through
# add_extended_features() (defined outside this notebook). Unlike the sample_corran_500
# load earlier in the notebook, no dropna() is applied to this read.
first_500_sample_scored_extended_features = add_extended_features(get_df(['../data/sample_data/first_500_sample_scored.csv'], tweet_dtype))