# ETL of Pipeline 2 Data for Visualization

In [2]:
import pandas as pd
from joblib import dump, load
import numpy as np

In [15]:
# Show the installed joblib version.
# NOTE(review): presumably recorded to help debug loading of the .joblib artifacts — confirm
import joblib
joblib.__version__

'1.1.0'

In [3]:
# Declare constants

# Models are identified by (directory name, file-name prefix) pairs.
MODEL_DIR_NAMES = ['linear_svc', 'multi_nb']
MODEL_FILE_NAMES = ['linearSVC', 'multinomialNB']
MODEL_NAMES = list(zip(MODEL_DIR_NAMES, MODEL_FILE_NAMES))

# Directory the visualization layer reads from (no trailing slash).
# TODO: resolve this relative to the project root (dev/) instead of climbing with '..'.
VIZ_2_DATA_LOC = '../../../visualizations/part2/data'

# Word vectors
WORD_VECS_LOC = './data/embeddings/russia_ukraine_words.csv'
# Output paths need an explicit '/' because VIZ_2_DATA_LOC has no trailing slash;
# plain concatenation would produce '.../datawords_sentiment_distrib.csv'.
WORDS_DISTRIB_OUTPUT_LOC = f'{VIZ_2_DATA_LOC}/words_sentiment_distrib.csv'
CLEAN_WORDS_OUTPUT_LOC = f'{VIZ_2_DATA_LOC}/cleaned_words.csv'

# Transformed tweets
TRANSFORMED_TWEETS_LOC = './data/transformed/russia_ukraine_sentiment.csv'
TWEETS_DISTRIB_OUTPUT_LOC = f'{VIZ_2_DATA_LOC}/russia_ukraine_sentiment_distrib.csv'
CLEAN_TWEETS_OUTPUT_LOC = f'{VIZ_2_DATA_LOC}/cleaned_russia_ukraine_tweets.csv'

# Top 10 Hashtags
HASHTAG_OUTPUT_LOC = f'{VIZ_2_DATA_LOC}/top_10_russia_ukraine_hashtags.csv'

# Metrics: input .joblib file and output .json file per model, keyed by model directory name
METRIC_LOCS = { d_name: f'./data/metrics/{d_name}/{f_name}_metrics.joblib' for d_name, f_name in MODEL_NAMES }
TRANSFORMED_METRIC_LOCS = { d_name: f'{VIZ_2_DATA_LOC}/metrics/{d_name}/{f_name}_metrics.json' for d_name, f_name in MODEL_NAMES}

# CV_Scores: same layout as the metrics locations
CV_LOCS = { d_name: f'./data/cv_scores/{d_name}/{f_name}_cv_scores.joblib' for d_name, f_name in MODEL_NAMES }
TRANSFORMED_CV_LOCS = { d_name: f'{VIZ_2_DATA_LOC}/cv_scores/{d_name}/{f_name}_cv_scores.json' for d_name, f_name in MODEL_NAMES}

### `metrics/` transformation

In [4]:
def pprintMetrics(metrics_dict):
    """Print each metric with the two models' results side by side.

    `metrics_dict` maps a metric name to either a two-element tuple/list
    (first element is printed under ' linear svc', second under ' multi_nb')
    or to any other value, which is printed as-is.

    When the first element is a multi-column ndarray or a dict, each result
    starts on its own line. Dict results are rendered one "key<TABs>value"
    row per entry, with nested dicts flattened to "k: v" pairs.
    """
    TAB_SIZE = 4

    def _render_report(report):
        # Tab padding per key is derived from key length, clipped to bounds
        # taken from the FIRST and LAST keys (not the true min/max lengths).
        widths = [len(label) for label in report.keys()]
        lower_tabs = widths[0] // TAB_SIZE
        upper_tabs = widths[-1] // TAB_SIZE
        scaled = np.array([(width // TAB_SIZE) - 1 for width in widths])
        pads = np.clip(np.abs(scaled - 2), a_min=lower_tabs + 1, a_max=upper_tabs - 1)

        rows = []
        for pad, (label, entry) in zip(pads, report.items()):
            if isinstance(entry, dict):
                rendered = '  '.join([f'{k}: {v}' for k, v in entry.items()])
            else:
                rendered = str(entry)
            rows.append(str(label) + ('\t' * pad) + rendered + '\n')
        return ''.join(rows)

    for title, results in metrics_dict.items():
        print(title)

        # Anything that is not a pair of per-model results is printed verbatim
        if not isinstance(results, (tuple, list)):
            print(results)
            print()
            continue

        svc_res, nb_res = results
        first = results[0]
        newline_count = 0
        indent_count = 0

        if isinstance(first, tuple):
            pass
        elif isinstance(first, np.ndarray) and len(first.shape) != 1 and first.shape[1] != 1:
            # Multi-column arrays read better starting on a fresh line
            newline_count = 1
        elif isinstance(first, dict):
            newline_count = 1
            svc_res, nb_res = [_render_report(report) for report in results]

        lead = ('\n' * newline_count) + ('\t' * indent_count)
        svc_out = ' linear svc\t' + lead + str(svc_res)
        nb_out = ' multi_nb\t' + lead + str(nb_res)

        print(svc_out)
        print(nb_out)
        print()

In [5]:
# Load every model's saved metrics, keyed by model directory name.
# joblib.load unpickles its input, so only point this at trusted files.
raw_metrics = dict(zip(METRIC_LOCS.keys(), map(load, METRIC_LOCS.values())))

In [6]:
# Collapse each model's metric records into a {metric name: result} mapping
flattened_metrics = {}
for model_key, metric_records in raw_metrics.items():
    per_model = {}
    for record in metric_records:
        # A record wrapped in a list contributes only its first element
        record = record[0] if isinstance(record, list) else record
        if isinstance(record, dict):
            per_model[record.get('name')] = record.get('result')
        else:
            # NOTE(review): a non-dict record replaces the whole per-model mapping — confirm this is intended
            per_model = record
    flattened_metrics[model_key] = per_model

# Line up the i-th (name, result) pair of every model; this relies on all
# models listing their metrics in the same order.
grouped_metrics = list(zip(*(per_model.items() for per_model in flattened_metrics.values())))

# {metric name: (result for model 1, result for model 2, ...)}
metrics_lookup = {}
for group in grouped_metrics:
    metric_names, metric_results = zip(*group)
    metrics_lookup[metric_names[0]] = metric_results

pprintMetrics(metrics_lookup)

CV Classification - accuracy
 linear svc	[0.95298281 0.95500506 0.9580172  0.95599393 0.9549823 ]
 multi_nb	[0.94034378 0.94034378 0.94334851 0.94638341 0.94284269]

Confusion Matrix
 linear svc	
[[ 384    0    0]
 [   0  295    0]
 [   0    0 9208]]
 multi_nb	
[[ 384    0    0]
 [   0  295    0]
 [   0    0 9208]]

Classification Report
 linear svc	
-1		precision: 1.0  recall: 1.0  f1-score: 1.0  support: 384
0		precision: 1.0  recall: 1.0  f1-score: 1.0  support: 295
1		precision: 1.0  recall: 1.0  f1-score: 1.0  support: 9208
accuracy	1.0
macro avg	precision: 1.0  recall: 1.0  f1-score: 1.0  support: 9887
weighted avg	precision: 1.0  recall: 1.0  f1-score: 1.0  support: 9887

 multi_nb	
-1		precision: 1.0  recall: 1.0  f1-score: 1.0  support: 384
0		precision: 1.0  recall: 1.0  f1-score: 1.0  support: 295
1		precision: 1.0  recall: 1.0  f1-score: 1.0  support: 9208
accuracy	1.0
macro avg	precision: 1.0  recall: 1.0  f1-score: 1.0  support: 9887
weighted avg	precision: 1.0  recall: 1

In [7]:
def assign_label(metric_name):
    """Return ((model name, result), ...) for one metric, in raw_metrics key order."""
    return tuple(zip(raw_metrics.keys(), metrics_lookup[metric_name]))


# Attach the model name to every per-model result
labeled_metrics = {metric_name: assign_label(metric_name) for metric_name in metrics_lookup}
labeled_metrics

{'CV Classification - accuracy': (('linear_svc',
   array([0.95298281, 0.95500506, 0.9580172 , 0.95599393, 0.9549823 ])),
  ('multi_nb',
   array([0.94034378, 0.94034378, 0.94334851, 0.94638341, 0.94284269]))),
 'Confusion Matrix': (('linear_svc',
   array([[ 384,    0,    0],
          [   0,  295,    0],
          [   0,    0, 9208]])),
  ('multi_nb',
   array([[ 384,    0,    0],
          [   0,  295,    0],
          [   0,    0, 9208]]))),
 'Classification Report': (('linear_svc',
   {'-1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 384},
    '0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 295},
    '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 9208},
    'accuracy': 1.0,
    'macro avg': {'precision': 1.0,
     'recall': 1.0,
     'f1-score': 1.0,
     'support': 9887},
    'weighted avg': {'precision': 1.0,
     'recall': 1.0,
     'f1-score': 1.0,
     'support': 9887}}),
  ('multi_nb',
   {'-1': {'precision': 1.0, 're

### `cv_scores/` transformation

In [11]:
# Inspect the resolved metric file locations per model
METRIC_LOCS

{'linear_svc': './data/metrics/linear_svc/linearSVC_metrics.joblib',
 'multi_nb': './data/metrics/multi_nb/multinomialNB_metrics.joblib'}

In [10]:
# Inspect the resolved cross-validation score file locations per model
CV_LOCS

{'linear_svc': './data/cv_scores/linear_svc/linearSVC_cv_scores.joblib',
 'multi_nb': './data/cv_scores/multi_nb/multinomialNB_cv_scores.joblib'}

In [12]:
# Import cross-validation scores for the linear SVC model.
# FIXME: this load currently fails with `KeyError: 44`; presumably the file is not a
# pickle that joblib can read (or was written by an incompatible version) — TODO confirm.
# Once it loads, restore the all-models form:
#   raw_cross_val = { name: load(path) for name, path in CV_LOCS.items() }
linear_svc_cv_scores = load(CV_LOCS['linear_svc'])  # path comes from the shared constant, not a duplicated literal
linear_svc_cv_scores

KeyError: 44

### Word & Tweet Sentiment

In [3]:
# Helper function

def formatPipelineOutput(df, drop_cols=None, col_mappings=None):
    """Clean a pipeline dataframe and summarize its sentiment distribution.

    Parameters
    ----------
    df : pandas.DataFrame
        Pipeline output; must contain a 'sentiment' column after dropping.
    drop_cols : list of str, optional
        Columns to remove before cleaning. Defaults to dropping nothing.
    col_mappings : dict, optional
        Rename applied to the summary, whose raw columns are 'index' (the
        sentiment label) and 'sentiment' (its number of rows). Must yield a
        'count' column. Defaults to
        {'index': 'sentiment', 'sentiment': 'count'}.

    Returns
    -------
    (clean_df, summary_df)
        clean_df: df without drop_cols and with missing values replaced by ''.
        summary_df: one row per sentiment label with its 'count' and its
        'percent' share of all rows.

    Raises
    ------
    KeyError
        If a drop column or 'sentiment' is missing, or if col_mappings does
        not produce a 'count' column.
    """
    # None defaults avoid sharing one mutable list/dict between calls
    drop_cols = list(drop_cols) if drop_cols else []
    if not col_mappings:
        col_mappings = {'index': 'sentiment', 'sentiment': 'count'}

    # Drop designated columns
    clean_df = df.drop(columns=drop_cols).fillna('') # May need to do more cleaning than this...

    # Collect sentiment value counts.
    # The summary columns are named explicitly because the names produced by
    # value_counts().reset_index() differ between pandas 1.x and 2.x, which
    # made the caller's rename collide into two 'count' columns on 2.x.
    counts = clean_df['sentiment'].value_counts()
    summary_df = pd.DataFrame({
        'index': counts.index.to_numpy(),
        'sentiment': counts.to_numpy(),
    })

    # Format dataframe for output
    summary_df = summary_df.rename(columns=col_mappings)
    if 'count' not in summary_df.columns:
        raise KeyError("col_mappings must produce a 'count' column")

    # Build column representing each sentiment value's "part of the whole"
    total_values = summary_df['count'].sum()
    summary_df['percent'] = summary_df['count'] / total_values * 100

    return clean_df, summary_df

In [4]:
# Read the word-vector table produced by the pipeline and preview it
words_df = pd.read_csv(filepath_or_buffer=WORD_VECS_LOC)
words_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff,sentiment
0,0,russia,[ 0.14867705 0.17193353 -0.12731107 0.033149...,0,1,18.919055,18.919055,positive
1,1,ukraine,[ 1.53168425e-01 1.72282770e-01 -1.33238226e-...,0,1,17.526398,17.526398,positive
2,2,war,[ 1.54297590e-01 1.70151323e-01 -1.26313880e-...,0,1,16.369316,16.369316,positive
3,3,"""",[ 0.1485806 0.16643363 -0.12510231 0.036086...,0,1,16.73749,16.73749,positive
4,4,putin,[ 1.62040293e-01 1.74694419e-01 -1.27796173e-...,0,1,19.686625,19.686625,positive


In [5]:
# Build the cleaned word table and its sentiment distribution.
# The summary's raw 'index'/'sentiment' columns become 'sentiment'/'count'.
col_mappings = dict(index='sentiment', sentiment='count')
drop_cols = ['Unnamed: 0', 'vectors', 'cluster']

clean_words_df, words_distrib_df = formatPipelineOutput(
    words_df,
    drop_cols=drop_cols,
    col_mappings=col_mappings,
)

In [6]:
# Write the word-level frames where the visualizations read them
for frame, destination in (
    (words_distrib_df, WORDS_DISTRIB_OUTPUT_LOC),
    (clean_words_df, CLEAN_WORDS_OUTPUT_LOC),
):
    frame.to_csv(destination)

In [7]:
# Read the sentiment-labelled tweets produced by the pipeline and preview them
tweets_df = pd.read_csv(filepath_or_buffer=TRANSFORMED_TWEETS_LOC)
tweets_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,date,username,retweets,tweet,hashtags,clean_tweet_words,clean_tweet,day,month,sentiment_val,sentiment
0,0,4/23/22,voidbourn,0,@El_Was_Taken @mariya_GuO @jacksonhinklle This...,,"['take', 'guo', 'russia', 'usa', 'permanent', ...",take guo russia usa permanent seat unite natio...,23,4,1,positive
1,1,4/23/22,applekappa1337,0,"@fedtanyl Thomas Friedman sucks, but the artic...",,"['thomas', 'friedman', 'suck', 'article', 'sim...",thomas friedman suck article simp authoritaria...,23,4,1,positive
2,2,4/23/22,mbw955,0,@pl4ma @TKensingtonian @freedomrideblog Not do...,,"['downplay', 'nazi', 'russian', 'aggression', ...",downplay nazi russian aggression greater russi...,23,4,1,positive
3,3,4/23/22,shodanette,0,@Rimlee18 @_Chosokaba @gadhi_minosh @KittBarte...,,"['chosokaba', 'minosh', 'trade', 'agreement', ...",chosokaba minosh trade agreement equal hence u...,23,4,1,positive
4,4,4/23/22,chilberg11,0,@InnaSovsun Russia won't stop at Transnistria....,,"['russia', 'stop', 'transnistria', 'putin', 'c...",russia stop transnistria putin claim moldova u...,23,4,1,positive


In [8]:
# Build the cleaned tweet table and its sentiment distribution.
# The summary's raw 'index'/'sentiment' columns become 'sentiment'/'count'.
col_mappings = dict(index='sentiment', sentiment='count')
drop_cols = ['Unnamed: 0', 'date', 'tweet', 'retweets', 'clean_tweet_words']

clean_tweets_df, tweets_distrib_df = formatPipelineOutput(
    tweets_df,
    drop_cols=drop_cols,
    col_mappings=col_mappings,
)

In [9]:
# Write the tweet-level frames where the visualizations read them
for frame, destination in (
    (tweets_distrib_df, TWEETS_DISTRIB_OUTPUT_LOC),
    (clean_tweets_df, CLEAN_TWEETS_OUTPUT_LOC),
):
    frame.to_csv(destination)

In [10]:
# Preview the cleaned tweets before extracting hashtags
clean_tweets_df.head(5)

Unnamed: 0,username,hashtags,clean_tweet,day,month,sentiment_val,sentiment
0,voidbourn,,take guo russia usa permanent seat unite natio...,23,4,1,positive
1,applekappa1337,,thomas friedman suck article simp authoritaria...,23,4,1,positive
2,mbw955,,downplay nazi russian aggression greater russi...,23,4,1,positive
3,shodanette,,chosokaba minosh trade agreement equal hence u...,23,4,1,positive
4,chilberg11,,russia stop transnistria putin claim moldova u...,23,4,1,positive


In [11]:
# Keep only tweets that carry hashtags (missing values were filled with '' during cleaning)
has_hashtags = clean_tweets_df['hashtags'] != ''
filtered_hashtags = clean_tweets_df.loc[has_hashtags, ['hashtags', 'sentiment']]
filtered_hashtags.head(5)

Unnamed: 0,hashtags,sentiment
8,ukraine russia economy,positive
9,ukraine russia,positive
21,unitednation russia ukrainewar,positive
26,russia ukraine,positive
33,russia russianukrainianwar russiannatowa...,positive


In [12]:
# Check row count and dtypes of the hashtag-bearing tweets
filtered_hashtags.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1716 entries, 8 to 9886
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   hashtags   1716 non-null   object
 1   sentiment  1716 non-null   object
dtypes: object(2)
memory usage: 40.2+ KB


In [15]:
# One row per hashtag: split each tweet's hashtag string on its separator and
# explode the pieces, carrying the tweet's sentiment along via the index.
exploded_hashtags = (
    filtered_hashtags
    .set_index('sentiment')
    .apply(lambda column: column.str.split('    ').explode())
    .reset_index()
)

# Whitespace-only pieces become NaN so they can be dropped; the rest are trimmed
blank_as_nan = exploded_hashtags['hashtags'].replace(r'^\s*$', np.nan, regex=True)
exploded_hashtags['hashtags'] = blank_as_nan.str.strip()

split_hashtags = exploded_hashtags.rename(columns={'hashtags': 'hashtag'}).dropna()
split_hashtags.head()

Unnamed: 0,sentiment,hashtag
0,positive,ukraine
1,positive,russia
2,positive,economy
3,positive,ukraine
4,positive,russia


In [16]:
# Check row count and dtypes after exploding to one hashtag per row
split_hashtags.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6158 entries, 0 to 6282
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  6158 non-null   object
 1   hashtag    6158 non-null   object
dtypes: object(2)
memory usage: 144.3+ KB


In [17]:
# Number of occurrences of each hashtag across all sentiments
cumulative_counts = split_hashtags.groupby('hashtag')['hashtag'].count()

# Attach that overall total to every row of its hashtag
split_hashtags = split_hashtags.assign(
    total_count=split_hashtags['hashtag'].map(cumulative_counts)
)
split_hashtags.head()

Unnamed: 0,sentiment,hashtag,total_count
0,positive,ukraine,675
1,positive,russia,1006
2,positive,economy,3
3,positive,ukraine,675
4,positive,russia,1006


In [18]:
# Count rows for every (hashtag, sentiment) group; the remaining column
# (total_count) is carried along as part of the counted value.
hashtag_sentiments = split_hashtags.groupby(['hashtag', 'sentiment']).value_counts().to_frame()
# Flatten back to plain columns: the counts end up under the label 0, which is
# renamed to 'sentiment_count', and the generated 'level_3' column is discarded.
# NOTE(review): 'level_3' and 0 are pandas-generated labels — confirm they are stable across pandas versions
hashtag_sentiments = hashtag_sentiments.stack().reset_index().rename(columns={0: 'sentiment_count'}).drop(columns=['level_3'])
hashtag_sentiments.head()

Unnamed: 0,hashtag,sentiment,total_count,sentiment_count
0,a,positive,1,1
1,abiyahmedali,positive,1,1
2,abramov,positive,2,2
3,acr,positive,1,1
4,advancement,positive,1,1


In [19]:
# Share (in percent) of all hashtag occurrences held by each (hashtag, sentiment) pair
sentiment_count_total = hashtag_sentiments['sentiment_count'].sum()
hashtag_sentiments['freq'] = hashtag_sentiments['sentiment_count'] / sentiment_count_total * 100
hashtag_sentiments

Unnamed: 0,hashtag,sentiment,total_count,sentiment_count,freq
0,a,positive,1,1,0.016239
1,abiyahmedali,positive,1,1,0.016239
2,abramov,positive,2,2,0.032478
3,acr,positive,1,1,0.016239
4,advancement,positive,1,1,0.016239
...,...,...,...,...,...
1547,zelenskyywasright,positive,1,1,0.016239
1548,zionist,positive,2,2,0.032478
1549,zmap,positive,2,2,0.032478
1550,zoom,positive,1,1,0.016239


In [20]:
# How many of the most frequent hashtags to keep
N_TOP_HASHTAGS = 10

# Order (hashtag, sentiment) rows by the hashtag's overall frequency
grouped_hashtags = hashtag_sentiments.set_index(['hashtag', 'sentiment']).sort_values('total_count', ascending=False)

# First N distinct hashtags in that order. Slicing (instead of indexing 0..9)
# computes the unique labels once and does not raise when fewer than N exist.
idx_slice = list(grouped_hashtags.index.get_level_values(0).unique()[:N_TOP_HASHTAGS])

# Keep every sentiment row belonging to those hashtags
top_10_hashtags = grouped_hashtags.loc[idx_slice]
top_10_hashtags

Unnamed: 0_level_0,Unnamed: 1_level_0,total_count,sentiment_count,freq
hashtag,sentiment,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
russia,neutral,1006,60,0.974342
russia,positive,1006,902,14.647613
russia,negative,1006,44,0.714518
ukraine,negative,675,38,0.617083
ukraine,neutral,675,47,0.763235
ukraine,positive,675,590,9.581033
putin,neutral,208,36,0.584605
putin,negative,208,29,0.470932
putin,positive,208,143,2.322183
usa,negative,85,1,0.016239


In [21]:
# Flatten the (hashtag, sentiment) index back into columns, drop the helper
# total used only for ranking, and write the result for the visualization
flat_top_hashtags = top_10_hashtags.reset_index()
output = flat_top_hashtags.drop(columns=['total_count'])
output.to_csv(HASHTAG_OUTPUT_LOC)