In [None]:
%run ./modules/load_data_module.ipynb
%run ./modules/extract_info_clean_data_module.ipynb
%run ./modules/update_dataframe_module.ipynb
%run ./modules/sentiment_module.ipynb
%run ./modules/semantic_similarity_module.ipynb

In [None]:
import pandas as pd
import spacy
import string
import regex as re
import nltk
import ast
import copy
import glob
import advertools as adv
import plotly.graph_objects as go


pd.options.display.max_colwidth = 285



from textblob import TextBlob

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


from spacymoji import Emoji
from nltk.corpus import stopwords
from urllib.parse import urlparse
from textblob import Word
from sklearn.metrics.pairwise import cosine_similarity



# Load the small English spaCy pipeline used by the rest of the notebook.
nlp = spacy.load("en_core_web_sm")
# NOTE(review): this Emoji instance is not referenced again in this notebook;
# it may be needed by the %run modules or be a leftover — confirm.
emoji = Emoji(nlp)
# Register the "emoji" component at the very front of the pipeline.
nlp.add_pipe("emoji", first=True)


# English Snowball stemmer from NLTK.
stemmer = nltk.SnowballStemmer("english")


# nltk.download('wordnet')
# nltk.download('omw-1.4')


In [None]:
# Directory holding the raw CSV exports. Kept in one place so the notebook can
# be pointed at another machine/location by editing a single line.
DATA_DIR = '/home/niloofar/Developement/Climate-Change/data/data'

# glob.glob returns matches in file-system order, which is not guaranteed to be
# stable; sorting keeps the file order (and anything derived from it, such as
# sampling) reproducible between runs.
tweets_path = sorted(glob.glob(f'{DATA_DIR}/*tweets.csv'))
users_path = sorted(glob.glob(f'{DATA_DIR}/*users.csv'))
replies_path = sorted(glob.glob(f'{DATA_DIR}/*replies.csv'))
quotes_path = sorted(glob.glob(f'{DATA_DIR}/*quotes.csv'))

In [None]:
# Column -> pandas dtype maps handed to get_df when reading the raw CSV files.
# Nullable "Int64"/"boolean"/"string" dtypes are used for every column.

# Columns of the users files.
user_dtype = dict(
    id="Int64",
    username="string",
    created_at="string",
    name="string",
    protected="boolean",
    verified="boolean",
    followers_count="Int64",
    following_count="Int64",
    tweet_count="Int64",
    listed_count="Int64",
    url="string",
    description="string",
)

# Columns shared by the tweets files and the replies/quotes files.
_shared_tweet_columns = dict(
    id="Int64",
    author_id="Int64",
    conversation_id="Int64",
    created_at="string",
    lang="string",
    retweet_count="Int64",
    reply_count="Int64",
    like_count="Int64",
    quote_count="Int64",
)

# Tweets additionally carry "reply_settings".
tweet_dtype = {**_shared_tweet_columns, "reply_settings": "string", "text": "string"}

# Replies and quotes carry "referenced_tweets" instead.
conversation_dtype = {**_shared_tweet_columns, "referenced_tweets": "string", "text": "string"}


In [None]:
# Read each family of raw CSV files with its dtype map.
# get_df is not defined in this notebook; presumably it comes from the
# %run load_data_module — confirm.
load_tweets = get_df(tweets_path, tweet_dtype)
load_replies = get_df(replies_path, conversation_dtype)
# NOTE(review): only quotes get .dropna(), which removes every row containing
# any missing value; replies are kept as loaded — confirm the asymmetry is intended.
load_quotes = get_df(quotes_path, conversation_dtype).dropna()
load_users = get_df(users_path, user_dtype)


# initial dataframe cleaning

In [None]:
# Cleaned user table, used further down when adding follower counts.
users = clean_users(load_users)

# Store add_reference_id's result as a 'reference_tweet_id' column on both
# conversation frames (replies first, then quotes).
for conversation in (load_replies, load_quotes):
    conversation['reference_tweet_id'] = add_reference_id(conversation)

### Generate sample data

In [None]:
# Run this cell to generate new sample data sets

# sample_tweets = get_sample_df(tweets_path, 0.01, tweet_dtype)
# sample_replies = get_sample_reply_quote(sample_tweets, load_replies)
# sample_quotes = get_sample_reply_quote(sample_tweets, load_quotes)

In [None]:
# sample_tweets.to_csv('tweets.csv', index=False)
# sample_replies.to_csv('replies.csv', index=False)
# sample_quotes.to_csv('quotes.csv', index=False)

In [None]:
# Folder with the stored sample CSVs. Kept in one place so the notebook can be
# pointed at another machine/location by editing a single line.
SAMPLE_DATA_DIR = '/home/niloofar/Developement/Climate-Change/data/sample_data'

# Load the previously generated samples of tweets, replies and quotes.
sample_tweets = pd.read_csv(f'{SAMPLE_DATA_DIR}/tweets.csv')
sample_replies = pd.read_csv(f'{SAMPLE_DATA_DIR}/replies.csv')
sample_quotes = pd.read_csv(f'{SAMPLE_DATA_DIR}/quotes.csv')

In [None]:
# Row counts of the three sample frames: tweets, replies, quotes.
print(*map(len, (sample_tweets, sample_replies, sample_quotes)))

### To load stored sample data

# GEN-DATA 

### Add list of hashtags found in tweet/reply/quote

In [None]:
# Same step for tweets, replies and quotes: store add_hashtags' result
# in a 'hashtags' column of each frame.
for frame in (sample_tweets, sample_replies, sample_quotes):
    frame['hashtags'] = add_hashtags(frame)

### Add count of hashtags in tweet/reply/quote

In [None]:
# Same step for tweets, replies and quotes: store add_nbr_hashtags' result
# in a 'hashtag_count' column of each frame.
for frame in (sample_tweets, sample_replies, sample_quotes):
    frame['hashtag_count'] = add_nbr_hashtags(frame)

### Add number of uppercase characters in tweet/reply/quote

In [None]:
# Same step for tweets, replies and quotes: store add_uppercase_count's result
# in an 'uppercase_count' column of each frame.
for frame in (sample_tweets, sample_replies, sample_quotes):
    frame['uppercase_count'] = add_uppercase_count(frame)

### Add percentage of uppercase characters in tweet/reply/quote

In [None]:
# Same step for tweets, replies and quotes: store add_upper_case_pct's result
# in an 'uppercase_pct' column of each frame.
for frame in (sample_tweets, sample_replies, sample_quotes):
    frame['uppercase_pct'] = add_upper_case_pct(frame)

### Add number of exclamation marks in tweet/reply/quote

In [None]:
# Same step for tweets, replies and quotes: store add_exclamation_mark_count's
# result in an 'exclamation_mark_count' column of each frame.
for frame in (sample_tweets, sample_replies, sample_quotes):
    frame['exclamation_mark_count'] = add_exclamation_mark_count(frame)

### Add number of question marks found in tweet/reply/quote

In [None]:
# Same step for tweets, replies and quotes: store add_question_mark_count's
# result in a 'question_mark_count' column of each frame.
for frame in (sample_tweets, sample_replies, sample_quotes):
    frame['question_mark_count'] = add_question_mark_count(frame)

### Add number of URLs found in tweet/reply/quote

In [None]:
# Same step for tweets, replies and quotes: store add_url_count's result
# in a 'url_count' column of each frame.
for frame in (sample_tweets, sample_replies, sample_quotes):
    frame['url_count'] = add_url_count(frame)

### Add number of mentions in tweet/reply/quote

In [None]:
# Same step for tweets, replies and quotes: store add_mention_count's result
# in a 'mention_count' column of each frame.
for frame in (sample_tweets, sample_replies, sample_quotes):
    frame['mention_count'] = add_mention_count(frame)

### Add number of emojis found in tweet/reply/quote

In [None]:
# Same step for tweets, replies and quotes: store add_emoji_count's result
# in an 'emojie_count' column of each frame (column name kept as spelled,
# since it ends up in the exported CSV files).
for frame in (sample_tweets, sample_replies, sample_quotes):
    frame['emojie_count'] = add_emoji_count(frame)

### Add tweet's author number of followers

In [None]:
# Tweets only: store add_followers_count's result as 'followers_count'.
# `users` is the cleaned user table built earlier with clean_users(load_users).
sample_tweets['followers_count'] = add_followers_count(sample_tweets, users)

### Add engagement score

In [None]:
# Tweets only: store add_engagement_score's result as 'engagement_score'.
# NOTE(review): the scoring formula lives in a %run module and is not visible here.
sample_tweets['engagement_score'] = add_engagement_score(sample_tweets)

In [None]:
# Remove the column limit so the previews below show every column.
pd.set_option('display.max_columns', None)

In [None]:
# Preview the first three tweets with the feature columns added so far.
sample_tweets.head(3)

In [None]:
# Preview the first three replies with the feature columns added so far.
sample_replies.head(3)

In [None]:
# Preview the first three quotes with the feature columns added so far.
sample_quotes.head(3)

## sentiment

### Add sentiment score to tweet/reply/quote

In [None]:
# Same step for tweets, replies and quotes: store add_sentiment_score's result
# in a 'sentiment_score' column of each frame.
for frame in (sample_tweets, sample_replies, sample_quotes):
    frame['sentiment_score'] = add_sentiment_score(frame)

### Add sentiment category (pos, neg, neu) to tweet/reply/quote

In [None]:
# Same step for tweets, replies and quotes: store add_sentiment_category's
# result in a 'sentiment_category' column of each frame.
for frame in (sample_tweets, sample_replies, sample_quotes):
    frame['sentiment_category'] = add_sentiment_category(frame)

### Add average sentiment of replies/quotes for each tweet

In [None]:
# One tweet-level column per conversation type, each filled by
# add_average_conversation_sentiment: replies first, then quotes.
average_sentiment_inputs = (
    ('avg_replies_sentiment', sample_replies),
    ('avg_quotes_sentiment', sample_quotes),
)
for column, conversations in average_sentiment_inputs:
    sample_tweets[column] = add_average_conversation_sentiment(sample_tweets, conversations)

### Add average sentiment of replies + quotes of each tweet

In [None]:
# Stack replies and quotes into one frame, then run the same averaging helper
# over the combined conversation.
replies_and_quotes = pd.concat([sample_replies, sample_quotes])
sample_tweets['avg_conversation_sentiment'] = add_average_conversation_sentiment(sample_tweets, replies_and_quotes)

### Add variance of replies/quotes/replies+quotes sentiment

In [None]:
# One tweet-level column per conversation set, each filled by
# add_variance_of_conversation_sentiment: replies, quotes, then both stacked.
sentiment_variance_inputs = (
    ('var_replies_sentiment', sample_replies),
    ('var_quotes_sentiment', sample_quotes),
    ('var_conversation_sentiment', pd.concat([sample_replies, sample_quotes])),
)
for column, conversations in sentiment_variance_inputs:
    sample_tweets[column] = add_variance_of_conversation_sentiment(sample_tweets, conversations)

### Add percentage of replies/quotes with similar sentiment to the reference tweet

In [None]:
# One tweet-level column per conversation type, each filled by
# add_pct_sentiment_category: replies first, then quotes.
similar_sentiment_inputs = (
    ('pct_similar_sentiment_replies', sample_replies),
    ('pct_similar_sentiment_quotes', sample_quotes),
)
for column, conversations in similar_sentiment_inputs:
    sample_tweets[column] = add_pct_sentiment_category(sample_tweets, conversations)

In [None]:
# Preview the first three tweets, now including the sentiment columns.
sample_tweets.head(3)

In [None]:
# Preview the first three replies, now including the sentiment columns.
sample_replies.head(3)

In [None]:
# Preview the first three quotes, now including the sentiment columns.
sample_quotes.head(3)

### text cleaning

In [None]:
# Same step for tweets, replies and quotes: store add_clean_text's result
# in a 'clean_text' column of each frame.
for frame in (sample_tweets, sample_replies, sample_quotes):
    frame['clean_text'] = add_clean_text(frame)

### Add tweet to conversation (note: add clean text to the conversation dataframe first!)

In [None]:
# For replies, then quotes: store add_tweet_to_conversation's result
# in a 'clean_tweet' column next to the conversation's own 'clean_text'.
for conversations in (sample_replies, sample_quotes):
    conversations['clean_tweet'] = add_tweet_to_conversation(sample_tweets, conversations)

### Add semantic similarity to conversations

In [None]:
# For quotes, then replies: compare each row's 'clean_tweet' with its
# 'clean_text' and store the result in a 'semantic_similarity' column.
for conversations in (sample_quotes, sample_replies):
    conversations['semantic_similarity'] = add_tweet_conv_semantic_similarity(
        conversations['clean_tweet'],
        conversations['clean_text'],
    )

In [None]:
# Write the enriched replies and quotes to CSV in the working directory,
# without the index column.
conversation_exports = (
    (sample_replies, 'sample_replies_final_version.csv'),
    (sample_quotes, 'sample_quotes_final_version.csv'),
)
for frame, filename in conversation_exports:
    frame.to_csv(filename, index=False)

### Add variance of semantic similarity scores 

In [None]:
def _semantic_similarity_variance(tweets, conversations):
    """Variance of 'semantic_similarity' over the conversation rows of each tweet.

    The previous cells called add_variance_of_conversation_sentiment with the
    very same arguments used for the var_*_sentiment columns, so the
    *_semantic columns merely duplicated the sentiment variance.

    Parameters
    ----------
    tweets : DataFrame with an 'id' column.
    conversations : DataFrame with 'reference_tweet_id' and
        'semantic_similarity' columns.

    Returns
    -------
    Series aligned with ``tweets``; NaN where a tweet has no conversation rows
    (or fewer than two, because the sample variance is undefined there).

    NOTE(review): assumes 'reference_tweet_id' holds the 'id' of the referenced
    tweet with a comparable dtype, and that 'semantic_similarity' is numeric —
    confirm against the %run modules. pandas' default sample variance (ddof=1)
    is used; confirm it matches the convention of the sentiment variance helper.
    """
    per_tweet = conversations.groupby('reference_tweet_id')['semantic_similarity'].var()
    return tweets['id'].map(per_tweet)


# Replies, quotes, then both stacked (same order as before).
semantic_variance_inputs = (
    ('var_replies_semantic', sample_replies),
    ('var_quotes_semantic', sample_quotes),
    ('var_conversation_semantic', pd.concat([sample_quotes, sample_replies])),
)
for column, conversations in semantic_variance_inputs:
    sample_tweets[column] = _semantic_similarity_variance(sample_tweets, conversations)

In [None]:
# Write the enriched tweets to CSV in the working directory, without the index column.
sample_tweets.to_csv('sample_tweets_final_version.csv', index=False)