# Analysis of adjectives

In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext
from createDataset.params import *
import ast
import os
import json
import pandas as pd

In [2]:
# Directory (relative to this notebook) that holds the input JSON files.
LOCAL_PATH = "../data/"
# Paths of the per-gender adjective count files.
ADJ_MALE, ADJ_FEM = (
    os.path.join(LOCAL_PATH, filename)
    for filename in ("count_male_adjectives.json", "count_female_adjectives.json")
)

In [3]:
# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()
# Keep a handle on the SparkContext that backs the session.
sc = spark.sparkContext

# Load the male and female adjective count files as DataFrames.
most_common_adj_male, most_common_adj_fem = (
    spark.read.json(counts_path) for counts_path in (ADJ_MALE, ADJ_FEM)
)

In [6]:
# Rank the male adjective table by descending count and preview the top rows.
most_common_adj_male = most_common_adj_male.sort(col("count").desc())
most_common_adj_male.show()

+----------+-----+
|adjectives|count|
+----------+-----+
|     right|59358|
|     large|38046|
|   popular|32918|
|   notable|29582|
|      best|28404|
|     major|28211|
|    active|26180|
|      high|24815|
|    famous|18211|
|successful|17535|
|      long|17157|
|     great|16970|
| prominent|16280|
| important|16050|
|      free|15042|
|    modern|14615|
|     civil|11679|
|  original|11091|
|   olympic| 9942|
|     minor| 9908|
+----------+-----+
only showing top 20 rows



In [8]:
# Rank the female adjective table by descending count and preview the top rows.
most_common_adj_fem = most_common_adj_fem.sort(col("count").desc())
most_common_adj_fem.show()

+-----------+-----+
| adjectives|count|
+-----------+-----+
|    popular| 8720|
|       best| 8010|
|    notable| 5832|
|      large| 5139|
|       high| 4924|
|    olympic| 4576|
|     active| 4430|
|       long| 4405|
|      major| 4050|
|      right| 3654|
| successful| 3305|
|     famous| 3302|
|competitive| 2864|
|      great| 2681|
|  prominent| 2456|
|   original| 2364|
|     modern| 2145|
|independent| 2118|
|      vocal| 1996|
| democratic| 1911|
+-----------+-----+
only showing top 20 rows



In [9]:
# Lexicon lookup table: word -> (strength, subj).
# Built from the first line of the lexicon file, which is expected to hold a
# list of records with the keys 'word', 'strength' and 'subj'.
subjectivity_dictionary = {}

with open('../data/subjectivity_dictionary.json', 'r') as json_file:
    raw_line = json_file.readline()

# Parse the line as data rather than eval()-ing it: eval() would execute any
# code embedded in the file and also rejects JSON's true/false/null.
try:
    lexicon_entries = json.loads(raw_line)
except ValueError:
    # Fallback for files written as a Python literal (e.g. single-quoted
    # strings), which eval() accepted but strict JSON does not.
    lexicon_entries = ast.literal_eval(raw_line)

for item in lexicon_entries:
    subjectivity_dictionary[item['word']] = (item['strength'], item['subj'])

In [18]:
def get_subjectivity(adj):
    """Return the 'subj' field of the lexicon entry for an adjective.

    Looks `adj` up in the module-level `subjectivity_dictionary`, whose values
    are `(strength, subj)` tuples.

    Args:
        adj: The adjective to look up.

    Returns:
        The second element of the entry (the 'subj' value), or None when the
        adjective is not in the lexicon. Returning None instead of raising
        lets a Spark UDF emit a null for unknown words rather than fail the
        whole job.
    """
    entry = subjectivity_dictionary.get(adj)
    if entry is None:
        return None
    return entry[1]

def get_strength(adj):
    """Return the 'strength' field of the lexicon entry for an adjective.

    Looks `adj` up in the module-level `subjectivity_dictionary`, whose values
    are `(strength, subj)` tuples.

    Args:
        adj: The adjective to look up.

    Returns:
        The first element of the entry (the 'strength' value), or None when
        the adjective is not in the lexicon. Returning None instead of raising
        lets a Spark UDF emit a null for unknown words rather than fail the
        whole job.
    """
    entry = subjectivity_dictionary.get(adj)
    if entry is None:
        return None
    return entry[0]

In [19]:
# Expose the two lexicon lookups as Spark UDFs.
udf_get_subj = udf(get_subjectivity)
udf_get_strength = udf(get_strength)

# Annotate each adjective count table with its lexicon fields:
# "subjectivity" first, then "strength".
most_common_adj_male = (
    most_common_adj_male
    .withColumn("subjectivity", udf_get_subj("adjectives"))
    .withColumn("strength", udf_get_strength("adjectives"))
)
most_common_adj_fem = (
    most_common_adj_fem
    .withColumn("subjectivity", udf_get_subj("adjectives"))
    .withColumn("strength", udf_get_strength("adjectives"))
)

In [20]:
# Preview the annotated male table (20 rows, the default for show()).
most_common_adj_male.show(20)

+----------+-----+------------+----------+
|adjectives|count|subjectivity|  strength|
+----------+-----+------------+----------+
|     right|59358|    positive|strongsubj|
|     large|38046|    positive|  weaksubj|
|   popular|32918|    positive|  weaksubj|
|   notable|29582|    positive|strongsubj|
|      best|28404|    positive|strongsubj|
|     major|28211|     neutral|  weaksubj|
|    active|26180|    positive|  weaksubj|
|      high|24815|     neutral|  weaksubj|
|    famous|18211|    positive|  weaksubj|
|successful|17535|    positive|  weaksubj|
|      long|17157|    negative|strongsubj|
|     great|16970|    positive|strongsubj|
| prominent|16280|    positive|  weaksubj|
| important|16050|    positive|  weaksubj|
|      free|15042|    positive|  weaksubj|
|    modern|14615|    positive|  weaksubj|
|     civil|11679|    positive|  weaksubj|
|  original|11091|    positive|  weaksubj|
|   olympic| 9942|     neutral|  weaksubj|
|     minor| 9908|     neutral|  weaksubj|
+----------

In [21]:
# Preview the annotated female table (20 rows, the default for show()).
most_common_adj_fem.show(20)

+-----------+-----+------------+----------+
| adjectives|count|subjectivity|  strength|
+-----------+-----+------------+----------+
|    popular| 8720|    positive|  weaksubj|
|       best| 8010|    positive|strongsubj|
|    notable| 5832|    positive|strongsubj|
|      large| 5139|    positive|  weaksubj|
|       high| 4924|     neutral|  weaksubj|
|    olympic| 4576|     neutral|  weaksubj|
|     active| 4430|    positive|  weaksubj|
|       long| 4405|    negative|strongsubj|
|      major| 4050|     neutral|  weaksubj|
|      right| 3654|    positive|strongsubj|
| successful| 3305|    positive|  weaksubj|
|     famous| 3302|    positive|  weaksubj|
|competitive| 2864|    positive|  weaksubj|
|      great| 2681|    positive|strongsubj|
|  prominent| 2456|    positive|  weaksubj|
|   original| 2364|    positive|  weaksubj|
|     modern| 2145|    positive|  weaksubj|
|independent| 2118|    positive|  weaksubj|
|      vocal| 1996|     neutral|  weaksubj|
| democratic| 1911|    positive|

In [22]:
# Paths of the per-gender Wikipedia adjective files, under LOCAL_PATH.
WIKI_MALE, WIKI_FEM = (
    os.path.join(LOCAL_PATH, filename)
    for filename in ("wikipedia_male_adjectives.json", "wikipedia_female_adjectives.json")
)

In [23]:
# Load the male and female Wikipedia adjective files as DataFrames.
df_male, df_fem = (spark.read.json(wiki_path) for wiki_path in (WIKI_MALE, WIKI_FEM))

In [25]:
def get_nb_adjs(list_adj):
    """Return the number of adjectives in a list.

    Args:
        list_adj: A sized collection of adjectives, or None.

    Returns:
        `len(list_adj)`, or 0 when `list_adj` is None, so that a row with a
        missing adjective list counts as having no adjectives instead of
        raising a TypeError inside a Spark UDF.
    """
    if list_adj is None:
        return 0
    return len(list_adj)

In [30]:
# Declare the return type explicitly: without it, udf() defaults to a string
# column, so "nb-adjs" would hold text ("2") rather than numbers and would
# sort and compare lexicographically.
udf_get_nb_adjs = udf(get_nb_adjs, "int")

# Add the per-row adjective count to both Wikipedia DataFrames.
df_male = df_male.withColumn("nb-adjs", udf_get_nb_adjs("adjectives"))
df_fem = df_fem.withColumn("nb-adjs", udf_get_nb_adjs("adjectives"))

In [44]:
# Print the mean and sample standard deviation of the per-row adjective
# count for each DataFrame, male first.
for label, frame in (("MALE", df_male), ("FEMALE", df_fem)):
    print(label)
    nb_adjs = col("nb-adjs")
    frame.agg(mean(nb_adjs), stddev(nb_adjs)).show()

MALE
+------------------+--------------------+
|      avg(nb-adjs)|stddev_samp(nb-adjs)|
+------------------+--------------------+
|2.1237752477189096|  1.9709265284971673|
+------------------+--------------------+

FEMALE
+------------------+--------------------+
|      avg(nb-adjs)|stddev_samp(nb-adjs)|
+------------------+--------------------+
|1.9299210914103722|  1.6754143543180764|
+------------------+--------------------+

