# Analysis of adjectives

In [None]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext
from createDataset.params import *
import os
import json
import pandas as pd

In [None]:
# Relative directory that holds the input data files.
LOCAL_PATH = "../data/"

# Per-gender adjective count files, resolved against LOCAL_PATH.
ADJ_MALE, ADJ_FEM = (
    os.path.join(LOCAL_PATH, file_name)
    for file_name in ("count_male_adjectives.json", "count_female_adjectives.json")
)

In [None]:
# Obtain the Spark session (an existing one is reused, otherwise one is created).
spark = SparkSession.builder.getOrCreate()
# Keep a handle on the session's SparkContext.
sc = spark.sparkContext

# Load the male and female adjective count files into DataFrames.
most_common_adj_male, most_common_adj_fem = (
    spark.read.json(source_path) for source_path in (ADJ_MALE, ADJ_FEM)
)

In [None]:
# Rank the male adjectives by descending "count" and preview the result.
by_count_desc = col("count").desc()
most_common_adj_male = most_common_adj_male.orderBy(by_count_desc)
most_common_adj_male.show()

In [None]:
# Rank the female adjectives by descending "count" and preview the result.
by_count_desc = col("count").desc()
most_common_adj_fem = most_common_adj_fem.orderBy(by_count_desc)
most_common_adj_fem.show()

In [None]:
import ast


def _parse_subjectivity_entries(raw_line):
    """Parse one line holding a list of word records without executing it.

    The previous implementation passed the line to ``eval``, which runs any
    code found in the file. Strict JSON is tried first; a Python-literal
    fallback keeps accepting lines written in ``repr`` form (e.g. single
    quotes), which ``eval`` also accepted.

    Args:
        raw_line: Text of the line to parse.

    Returns:
        The parsed object; the caller iterates over it and reads the
        ``'word'``, ``'strength'`` and ``'subj'`` keys of every item.
    """
    try:
        return json.loads(raw_line)
    except ValueError:  # json.JSONDecodeError is a subclass of ValueError
        return ast.literal_eval(raw_line)


# Maps each word to a (strength, subjectivity) tuple.
subjectivity_dictionary = {}

with open('../data/subjectivity_dictionary.json', 'r') as json_file:
    # Only the first line of the file is read, as before.
    for item in _parse_subjectivity_entries(json_file.readline()):
        subjectivity_dictionary[item['word']] = (item['strength'], item['subj'])

In [None]:
def get_subjectivity(adj):
    """Return the subjectivity label stored for ``adj``.

    Args:
        adj: Word to look up in ``subjectivity_dictionary``.

    Returns:
        The second element of the word's ``(strength, subjectivity)`` entry,
        or ``None`` when the word has no entry. Previously the ``None``
        returned by ``dict.get`` was indexed, raising ``TypeError`` for every
        unknown word.
    """
    entry = subjectivity_dictionary.get(adj)
    return None if entry is None else entry[1]

def get_strength(adj):
    """Return the strength label stored for ``adj``.

    Args:
        adj: Word to look up in ``subjectivity_dictionary``.

    Returns:
        The first element of the word's ``(strength, subjectivity)`` entry,
        or ``None`` when the word has no entry. Previously the ``None``
        returned by ``dict.get`` was indexed, raising ``TypeError`` for every
        unknown word.
    """
    entry = subjectivity_dictionary.get(adj)
    return None if entry is None else entry[0]

In [None]:
# Wrap the dictionary lookups as Spark UDFs so they can be applied to a column.
udf_get_subj = udf(get_subjectivity)
udf_get_strength = udf(get_strength)

# Annotate each table with the subjectivity, then the strength, of every
# value in its "adjectives" column.
most_common_adj_male = (
    most_common_adj_male
    .withColumn("subjectivity", udf_get_subj("adjectives"))
    .withColumn("strength", udf_get_strength("adjectives"))
)
most_common_adj_fem = (
    most_common_adj_fem
    .withColumn("subjectivity", udf_get_subj("adjectives"))
    .withColumn("strength", udf_get_strength("adjectives"))
)

In [None]:
# Preview the male adjective table.
most_common_adj_male.show()

In [None]:
# Preview the female adjective table.
most_common_adj_fem.show()

In [None]:
# Per-gender Wikipedia adjective files, resolved against LOCAL_PATH.
WIKI_MALE, WIKI_FEM = (
    os.path.join(LOCAL_PATH, file_name)
    for file_name in ("wikipedia_male_adjectives.json", "wikipedia_female_adjectives.json")
)

In [None]:
# Load the male and female Wikipedia adjective files into DataFrames.
df_male, df_fem = (
    spark.read.json(source_path) for source_path in (WIKI_MALE, WIKI_FEM)
)

In [None]:
def get_nb_adjs(list_adj):
    """Count the entries of ``list_adj``.

    Args:
        list_adj: Sized collection of adjectives.

    Returns:
        The number of elements, as reported by ``len``.
    """
    nb_adjs = len(list_adj)
    return nb_adjs

In [None]:
# Declare the return type explicitly: without it udf() produces a string
# column, so the adjective counts would be stored as text and the later
# mean/stddev aggregation would rely on an implicit string-to-number cast.
udf_get_nb_adjs = udf(get_nb_adjs, "int")

# Add the number of adjectives of every row as the integer column "nb-adjs".
df_male = df_male.withColumn("nb-adjs", udf_get_nb_adjs("adjectives"))
df_fem = df_fem.withColumn("nb-adjs", udf_get_nb_adjs("adjectives"))

In [None]:
# Print the mean and standard deviation of "nb-adjs", male table first.
for heading, frame in (("MALE", df_male), ("FEMALE", df_fem)):
    print(heading)
    frame.agg(mean(col("nb-adjs")), stddev(col("nb-adjs"))).show()

## Word cloud for the most common adjectives

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import numpy as np
from PIL import Image

In [None]:
def _frequency_records(adjective_counts):
    """Convert an adjective count table into pandas ``records`` dictionaries.

    Args:
        adjective_counts: Spark DataFrame with ``adjectives`` and ``count``
            columns.

    Returns:
        A list with one dictionary per row of the transposed pandas frame.
        Only ``count`` remains as a row after indexing by ``adjectives``, so
        element 0 maps each adjective to its count.
    """
    counts_pdf = adjective_counts.select('adjectives', 'count').toPandas()
    indexed_by_adjective = counts_pdf.set_index('adjectives')
    return indexed_by_adjective.T.to_dict('records')


adj_male_dict = _frequency_records(most_common_adj_male)
adj_female_dict = _frequency_records(most_common_adj_fem)

In [None]:
# Map the subjectivity of each word to a display colour. Words whose
# subjectivity is none of the three labels below get no entry.
subjectivity_colors = (
    ("positive", 'forestgreen'),
    ("negative", 'crimson'),
    ("neutral", 'grey'),
)

word_to_color = dict()

for word, entry in subjectivity_dictionary.items():
    for label, color in subjectivity_colors:
        if entry[1] == label:
            word_to_color[word] = color

def color_func(word, *args, **kwargs):
    """Return the colour used to draw ``word`` in the word cloud.

    Args:
        word: Word being drawn.
        *args: Extra positional arguments supplied by the caller; ignored.
        **kwargs: Extra keyword arguments supplied by the caller; ignored.

    Returns:
        The colour registered for ``word`` in ``word_to_color``, or
        ``'#000000'`` (black) when the word has no entry.
    """
    return word_to_color.get(word, '#000000')


In [None]:
# Load male.png as an array to use as the word cloud mask.
male_mask = np.array(Image.open("male.png"))

cloud_options = dict(
    background_color="white",
    max_words=500,
    mask=male_mask,
    contour_width=3,
    contour_color='peru',
    color_func=color_func,
)
wc = WordCloud(**cloud_options)

# Build the cloud from the adjective -> count mapping held in element 0.
wc.generate_from_frequencies(adj_male_dict[0])

# Saving to disk is disabled:
# wc.to_file(path.join(d, "male_adj.png"))

# Draw the cloud on a 20x10 figure with the axes hidden.
plt.figure(figsize=[20, 10])
plt.imshow(wc, cmap=plt.cm.gray, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Load female.png as an array to use as the word cloud mask.
female_mask = np.array(Image.open("female.png"))

cloud_options = dict(
    background_color="white",
    max_words=500,
    mask=female_mask,
    contour_width=3,
    contour_color='peru',
    color_func=color_func,
)
wc = WordCloud(**cloud_options)

# Build the cloud from the adjective -> count mapping held in element 0.
wc.generate_from_frequencies(adj_female_dict[0])

# Saving to disk is disabled:
# wc.to_file(path.join(d, "female_adj.png"))

# Draw the cloud on a 20x10 figure with the axes hidden.
plt.figure(figsize=[20, 10])
plt.imshow(wc, cmap=plt.cm.gray, interpolation="bilinear")
plt.axis("off")
plt.show()

## Compare the strong positive/negative adjectives in the overviews

In [None]:
# Preview the male adjective table before aggregating it.
most_common_adj_male.show()

In [None]:
# Sum the adjective counts per (strength, subjectivity) pair, largest first.
# `sum` here is pyspark.sql.functions.sum (star import), not the builtin.
overview_subjectivity_male = (
    most_common_adj_male
    .groupBy('strength', 'subjectivity')
    .agg(sum('count').alias('sum_'))
    .orderBy(desc('sum_'))
)
# Show empty-string labels as the text 'None'.
overview_subjectivity_male = overview_subjectivity_male.replace('', 'None')
overview_subjectivity_male.show()


# Same aggregation for the female table.
overview_subjectivity_female = (
    most_common_adj_fem
    .groupBy('strength', 'subjectivity')
    .agg(sum('count').alias('sum_'))
    .orderBy(desc('sum_'))
)
overview_subjectivity_female = overview_subjectivity_female.replace('', 'None')
overview_subjectivity_female.show()

In [None]:
# Row filters shared by both tables.
strong_only = col("strength") == "strongsubj"
polar_only = (col("subjectivity") == "positive") | (col("subjectivity") == "negative")

# Total count of the strongly subjective positive/negative male adjectives,
# one row per (strength, subjectivity) pair.
strong_adj_male = (
    most_common_adj_male
    .where(strong_only)
    .groupBy('strength', 'subjectivity')
    .agg({'count': 'sum'})
    .where(polar_only)
)

# Express each total as a percentage of the sum of all male adjective counts.
male_total = most_common_adj_male.agg({'count': 'sum'}).collect()[0][0]
strong_adj_male = strong_adj_male.withColumn(
    "percentage", 100 * strong_adj_male['sum(count)'] / male_total
)

strong_adj_male.show()


# Same computation for the female table.
strong_adj_female = (
    most_common_adj_fem
    .where(strong_only)
    .groupBy('strength', 'subjectivity')
    .agg({'count': 'sum'})
    .where(polar_only)
)

female_total = most_common_adj_fem.agg({'count': 'sum'}).collect()[0][0]
strong_adj_female = strong_adj_female.withColumn(
    "percentage", 100 * strong_adj_female['sum(count)'] / female_total
)


strong_adj_female.show()

In [None]:
# Show the male adjectives that are both strongly subjective and positive.
most_common_adj_male.where((col("strength") == "strongsubj") & (col("subjectivity") == 'positive')).show()

In [None]:
# Show the male adjectives that are both strongly subjective and negative.
most_common_adj_male.where((col("strength") == "strongsubj") & (col("subjectivity") == 'negative')).show()

In [None]:
# Number of rows (not summed occurrences) of the male table that are
# strongly subjective, split by polarity.
strong_only = col("strength") == "strongsubj"

strong_pos_male = most_common_adj_male.where(
    strong_only & (col("subjectivity") == 'positive')
).count()

strong_neg_male = most_common_adj_male.where(
    strong_only & (col("subjectivity") == 'negative')
).count()

