In [1]:
# Load the packages that need to be used
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from wordcloud import WordCloud
import wordcloud
import utils
from nltk.sentiment import SentimentIntensityAnalyzer
from nrclex import NRCLex
from platform import platform
import re

In [2]:
# nltk.downloader.download('vader_lexicon')

In [3]:
# Project root used to build every data/figure path in this notebook.
# Set the ADS_PROJECT_ROOT environment variable to run on another machine;
# the original checkout location is kept as the fallback so existing runs
# behave exactly as before. `path` stays a plain str because later cells
# build file names with `path + '/figs/...'`.
import os

path = os.environ.get(
    "ADS_PROJECT_ROOT",
    "/Users/oliviaz/Documents/GitHub/ads-spring2023-project1-mmmzzss",
)

# Load the raw philosophy dataset from the project's data folder
df_raw = pd.read_csv(path + "/data/philosophy_data.csv")

# Exploratory Data Analysis - Whole Dataset

### Author: Jingshu Zhang 
### UNI: jz3552

##### Data Preprocessing and Checking

In [4]:
# Print each column's dtype and non-null count to check for missing values
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360808 entries, 0 to 360807
Data columns (total 11 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   title                      360808 non-null  object
 1   author                     360808 non-null  object
 2   school                     360808 non-null  object
 3   sentence_spacy             360808 non-null  object
 4   sentence_str               360808 non-null  object
 5   original_publication_date  360808 non-null  int64 
 6   corpus_edition_date        360808 non-null  int64 
 7   sentence_length            360808 non-null  int64 
 8   sentence_lowered           360808 non-null  object
 9   tokenized_txt              360808 non-null  object
 10  lemmatized_str             360808 non-null  object
dtypes: int64(3), object(8)
memory usage: 30.3+ MB


##### From the info above, every column has as many non-null values as there are rows, so the dataset has no missing values.

## Frequency of authors and schools - Degree of influence

##### We look at how many sentences each author and each school contributes to the dataset, using the matplotlib and seaborn libraries.

In [1]:
# Horizontal count plot: number of rows in df_raw per author
fig, ax = plt.subplots()
fig.set_size_inches(10, 8)

# `order` expects the unique category labels. value_counts() yields each
# author once, sorted from most to least frequent; passing the raw column
# (as before) repeats a label for every row of the dataset.
authors_by_count = df_raw['author'].value_counts().index
sns.countplot(y='author', data=df_raw, order=authors_by_count,
              palette='blend:#7AB,#EDA', ax=ax)

# Title and axis labels are applied after drawing so the plotting call
# cannot replace them with its default labels.
ax.set_title('Freq of Publication by Author', fontweight='bold')
ax.set_xlabel('Count', fontsize=12, labelpad=10, fontweight='bold')
ax.set_ylabel('Author', fontsize=12, labelpad=10, fontweight='bold')

fig.savefig(path + '/figs/Freq_By_Author.jpg');

NameError: name 'plt' is not defined

In [None]:
# Horizontal count plot: number of rows in df_raw per school, most frequent first
fig, ax = plt.subplots()
fig.set_size_inches(10, 8)

schools_by_count = df_raw['school'].value_counts().index
sns.countplot(y='school', data=df_raw, order=schools_by_count,
              palette='blend:#7AB,#EDA', ax=ax)

# Rotate/resize the school names on the y axis. This is done after drawing
# and through tick_params, instead of copying the (empty-axes) x tick labels
# onto the y axis before anything was plotted.
ax.tick_params(axis='y', labelrotation=45, labelsize=10)

# Title and axis labels are applied after drawing so the plotting call
# cannot replace them with its default labels.
ax.set_title('Freq of Publication by School', fontweight='bold')
ax.set_xlabel('Count', fontsize=12, labelpad=10, fontweight='bold')
ax.set_ylabel('School', fontsize=12, labelpad=10, fontweight='bold')

fig.savefig(path + '/figs/Freq_By_School.jpg');

##### The chart indicates that the work of Aristotle, Plato, and Hegel is significantly greater than that of other authors. 

## Average length of sentences across Authors and Schools - Writing style

##### The average length of sentences is visualized to find the different writing style - verbose or concise

In [None]:
# Mean of `sentence_length` per author, kept as a flat two-column frame
# (author, sentence_length) for plotting.
df_senten_avglength = (
    df_raw
    .groupby('author', as_index=False)['sentence_length']
    .mean()
)
df_senten_avglength.head(10)

In [None]:
# Horizontal bar plot of mean sentence length per author, longest first
fig, ax = plt.subplots()
fig.set_size_inches(10, 8)

authors_by_length = df_senten_avglength.sort_values('sentence_length', ascending=False)['author']
sns.barplot(y='author', x='sentence_length', data=df_senten_avglength,
            order=authors_by_length, palette='blend:#7AB,#EDA', ax=ax)

# Bars are horizontal: the x axis carries the mean length and the y axis the
# author, so the axis labels must follow that orientation (they were swapped).
# tick_params restyles the ticks without freezing the numeric tick text.
ax.tick_params(axis='x', labelrotation=90, labelsize=10)
ax.set_xlabel('sentence_avglength', fontsize=12, labelpad=10, fontweight='bold')
ax.set_ylabel('Author', fontsize=12, labelpad=10, fontweight='bold')
ax.set_title('Average sentence length across Authors', fontweight='bold')

fig.savefig(path + '/figs/Avgsentlen_Author.jpg');

In [None]:
# Mean of `sentence_length` per school, kept as a flat two-column frame
# (school, sentence_length) for plotting.
avgsentlen_school = (
    df_raw
    .groupby('school', as_index=False)['sentence_length']
    .mean()
)
avgsentlen_school.head(10)

In [None]:
# Horizontal bar plot of mean sentence length per school, longest first
fig, ax = plt.subplots()
fig.set_size_inches(10, 8)

schools_by_length = avgsentlen_school.sort_values('sentence_length', ascending=False)['school']
sns.barplot(y='school', x='sentence_length', data=avgsentlen_school,
            order=schools_by_length, palette='blend:#7AB,#EDA', ax=ax)

# Bars are horizontal: the x axis carries the mean length and the y axis the
# school, so the axis labels must follow that orientation (they were swapped).
# tick_params restyles the ticks without freezing the numeric tick text.
ax.tick_params(axis='x', labelrotation=45, labelsize=8)
ax.set_xlabel('sentence_avglength', fontsize=12, labelpad=10, fontweight='bold')
ax.set_ylabel('School', fontsize=12, labelpad=10, fontweight='bold')
ax.set_title('Average sentence length across Schools', fontweight='bold')

fig.savefig(path + '/figs/Avgsentlen_Schools.jpg');

## Compound Sentiment Score by School over time

##### Looking only at basic characteristics of the authors and schools might not be enough to understand feminism in depth, so I decided to explore the compound sentiment score of the different schools.

##### This code performs sentiment analysis on the sentences in the 'sentence_str' column of the 'df_raw' dataframe using the SentimentIntensityAnalyzer from the nltk package. The sentiment analysis is performed by calculating the negative, neutral, positive, and a compound score which ranges from -1  to 1. The compound score is a metric that represents the overall sentiment of the sentence
##### Additionally, the 'sentence_spacy', 'corpus_edition_date', 'sentiment_scores', and 'sentence_lowered' columns are dropped as they are no longer in use.

In [None]:
sia = SentimentIntensityAnalyzer()

# Score every sentence and keep only the compound value.
# Compound score ranges from -1 (neg) through 0 (neu) to 1 (pos); it is the
# only part of the polarity dictionary used in the rest of the analysis, so
# the intermediate dictionary column is never stored.
compound_scores = df_raw['sentence_str'].map(
    lambda sentence: sia.polarity_scores(sentence)['compound']
)

# Build the working frame without in-place mutation so this cell can be
# re-run safely:
#   * errors='ignore' lets the drop succeed when the columns are already gone
#     (previously a second run raised KeyError);
#   * rename is a no-op once 'original_publication_date' is already 'year'.
df_raw = (
    df_raw
    .assign(compound_score=compound_scores)
    # Delete irrelevant columns
    .drop(columns=['sentence_spacy', 'corpus_edition_date', 'sentiment_scores', 'sentence_lowered'],
          errors='ignore')
    # Rename 'original_publication_date' column to 'year' for easy operation
    .rename(columns={'original_publication_date': 'year'})
)

df_raw.head(10)

In [None]:
# 3x3 grid of line plots: compound sentiment score by year, one panel per school
fig, axes = plt.subplots(3, 3, figsize=(18, 18))

# Schools in panel order (filled row by row); each panel title is the
# capitalised school name.
panel_schools = [
    'analytic', 'continental', 'empiricism',
    'phenomenology', 'stoicism', 'communism',
    'feminism', 'capitalism', 'rationalism',
]

for panel, school_name in zip(axes.flat, panel_schools):
    panel.set_title(school_name.capitalize())

for panel, school_name in zip(axes.flat, panel_schools):
    school_rows = df_raw[df_raw['school'] == school_name]
    sns.lineplot(ax=panel, x='year', y='compound_score', data=school_rows)

plt.savefig(path + '/figs/Sentiment_score_overtime.jpg');

##### I noticed that Feminism is the only school whose compound score is still below -0.05, and therefore regarded as negative, up until its most recent publication. This motivates me to further examine how its authors express themselves and what they cover in their publications.

# Exploratory Data Analysis - School of Feminism

## Contribution of authors in Feminism

##### First look at who are the contributors to Feminism by filtering the dataset with school being feminism

In [None]:
# Keep only the feminism rows. The explicit .copy() makes df_fem an
# independent frame, so the columns added to it in later cells do not write
# into a slice of df_raw (which triggers SettingWithCopyWarning).
df_fem = df_raw[df_raw['school'] == 'feminism'].copy()

# Distinct (year, author, title) combinations; df_fem is already filtered,
# so no second school filter is needed.
df_fem[['year', 'author', 'title']].drop_duplicates()

In [None]:
# Number of df_fem rows contributed by each author
fem_author_counts = df_fem.groupby(['author']).size().to_frame('count').reset_index()

# Create a fresh figure/axes for this plot; previously the labels were set on
# an `ax` left over from an earlier, unrelated figure.
fig, ax = plt.subplots()
sns.barplot(y='author', x='count', data=fem_author_counts, ax=ax)

# Bars are horizontal: counts on the x axis, authors on the y axis
ax.set_xlabel('Count', fontsize=12, labelpad=10, fontweight='bold')
ax.set_ylabel('Author', fontsize=12, labelpad=10, fontweight='bold')
ax.set_title('Contribution across Author', fontweight='bold')

fig.savefig(path + '/figs/Fem_contribution_Author.jpg');

## Sentiment Analysis across Authors

In [None]:
# Create a new column to store the categorical result by divding the compound score
senti_ana = []

for score in df_fem['compound_score']:
    if score >= 0.05:
        senti_ana.append('positive')
    elif score <= -0.05 :
        senti_ana.append('negative')
    else:
        senti_ana.append('neutral')

df_fem['senti_ana'] = senti_ana

##### Plot the number of sentences in each sentiment category (positive, neutral, negative) for each author

In [None]:
# Number of df_fem rows per (author, sentiment category)
fem_sentiment_counts = df_fem.groupby(['author', 'senti_ana']).size().to_frame('count').reset_index()

# Create a fresh figure/axes for this plot; previously the labels were set on
# an `ax` left over from an earlier, unrelated figure.
fig, ax = plt.subplots()
sns.barplot(x='author', y='count', hue='senti_ana', data=fem_sentiment_counts, ax=ax)

ax.set_xlabel('Author', fontsize=12, labelpad=10, fontweight='bold')
ax.set_ylabel('Count', fontsize=12, labelpad=10, fontweight='bold')
ax.set_title('Compound sentiment score across Author', fontweight='bold')

fig.savefig(path + '/figs/Fem_sentiana_Author.jpg');

##### From this plot, it can be seen that the contribution to feminism of Beauvoir is much higher than that of the other two authors. Both Beauvoir and Wollstonecraft expressed positive sentiments more than negative sentiment. For all of these three authors, neutral sentiments were the least expressed.

## Emotions Analysis across Author

In [None]:
# nltk.download('punkt')

In [None]:
# Using NRCLex to get emotions of each sentence in the df_fem.
# For every lemmatized sentence, build an NRCLex object and keep its
# `top_emotions` value; the results are collected in row order.
# (The next cell indexes each entry as [0][0], i.e. it is treated as a
# sequence of tuples whose first item is the emotion name.)
emotion_labels = [
    NRCLex(lemmatized_text).top_emotions
    for lemmatized_text in df_fem['lemmatized_str']
]


In [None]:
# Create a new list to store the most related emotion of each sentence:
# take the name (first tuple item) of the first entry in each top-emotions list.
# NOTE(review): when several emotions tie (possibly including sentences with
# no emotion words at all), only the first listed one is kept, which may
# favour whichever emotion NRCLex lists first — confirm against NRCLex.
emotions = [top_emotions[0][0] for top_emotions in emotion_labels]

# assign() returns a new frame rather than writing into df_fem in place, so
# no SettingWithCopyWarning is raised if df_fem is a filtered slice of df_raw.
df_fem = df_fem.assign(emotions=emotions)

##### Visualize the emotions across author

In [None]:
sns.barplot(x = 'author', y = 'count', hue = 'emotions', data = df_fem.groupby(['author', 'emotions']).size().to_frame('count').reset_index())
ax.set_xlabel('Author', fontsize=12, labelpad=10, fontweight='bold')
ax.set_ylabel('Count', fontsize=12, labelpad=10, fontweight='bold')
plt.title('Emotions Analysis across Author', fontweight = 'bold')

plt.savefig(path + '/figs/Fem_emotions_Author.jpg');

##### This plot shows that, apart from the general positive and negative sentiments, fear and trust are the emotions most frequently expressed by all three authors.

## WordCloud of 3 authors

In [None]:
# Fetch the NLTK stop-word corpus, then build a set of English stop words
# (a set gives fast membership checks in the word-count helpers below).
from nltk.corpus import stopwords

nltk.download('stopwords')
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

In [None]:
# write function to find the top 10 most frequent word across author
# Flatten -- Join all string except for the stop words-- count the words
def processStrings(token_string):
    """Turn a stringified token list into a list of plain tokens.

    Every single quote and square bracket is removed from ``token_string``
    and the remainder is split on the literal separator ``', '``.
    For example ``"['a', 'b']"`` becomes ``['a', 'b']``.
    """
    # Deletion table for the three characters: ' [ ]
    strip_table = str.maketrans('', '', "'[]")
    cleaned = token_string.translate(strip_table)
    return cleaned.split(', ')

def processTokens(token_text):
    """Flatten a column of token strings into one stop-word-free word list.

    ``token_text`` must provide ``.to_numpy()`` and hold strings (the callers
    pass the ``tokenized_txt`` column — presumably stringified token lists;
    verify against the dataset). All values are joined with ``', '``, split
    into tokens by ``processStrings``, and every token found in the
    module-level ``stop_words`` set is dropped. Order and duplicates are kept.
    """
    combined = ', '.join(token_text.to_numpy().flatten())
    return [word for word in processStrings(combined) if word not in stop_words]

def getWordCounts(filtered_sentence):
    """Count how often each word occurs in ``filtered_sentence``.

    Returns a pandas Series indexed by word whose values are occurrence
    counts, ordered from most to least frequent.
    """
    words = pd.Series(filtered_sentence)
    return words.value_counts()

### WordCloud -- Beauvoir

##### The same process is repeated three times to plot a word cloud and the top-10 words for each of the three authors

In [None]:
# Wordcloud for Beauvoir
# Join every Beauvoir sentence into one space-separated string
all_sentences = ' '.join(df_fem[df_fem['author'] == 'Beauvoir']['sentence_str'])

# Build the word cloud. The result is bound to `cloud_beauvoir` rather than
# `wordcloud`, so the `wordcloud` module imported at the top of the notebook
# is no longer overwritten by a WordCloud instance.
cloud_beauvoir = WordCloud(max_font_size=80, width=600, height=400, background_color='white').generate(all_sentences)

# plot the wordcloud
plt.figure(figsize=(12,10))
plt.imshow(cloud_beauvoir, interpolation='bilinear')
plt.axis("off")
plt.title('WordCloud for Beauvoir', fontsize=14, fontweight='bold')
# Save before show(): the figure must still be current when it is written
plt.savefig(path + "/figs/Wordcloud_Beauvoir.png")
plt.show()


In [None]:
# Word frequencies for Beauvoir: tokens without stop words -> counts ->
# two-column frame (Word, Frequency) ordered from most to least frequent
beauvoir_tokens = df_fem.loc[df_fem['author'] == 'Beauvoir', 'tokenized_txt']
processed_Bea = processTokens(beauvoir_tokens)
Beauvoir_wcounts = getWordCounts(processed_Bea)
Bead_df = Beauvoir_wcounts.rename_axis('Word').reset_index(name='Frequency')
Bead_df.head(10)

In [None]:
# Horizontal bar plot of the ten most frequent words for Beauvoir.
# A fresh figure/axes is created here; previously the labels were set on an
# `ax` left over from an earlier, unrelated figure.
fig, ax = plt.subplots()
sns.barplot(x='Frequency', y='Word', data=Bead_df[0:10], ax=ax)

ax.set_xlabel('Frequency', fontsize=12, labelpad=10, fontweight='bold')
ax.set_ylabel('Word', fontsize=12, labelpad=10, fontweight='bold')
ax.set_title('Top 10 Words for Beauvoir', fontweight='bold')

fig.savefig(path + '/figs/Top10_Beauvoir.jpg');

### WordCloud for Davis

In [None]:
# Wordcloud for Davis
# Join every Davis sentence into one space-separated string
all_sentences = ' '.join(df_fem[df_fem['author'] == 'Davis']['sentence_str'])

# Build the word cloud. The result is bound to `cloud_davis` rather than
# `wordcloud`, so the `wordcloud` module imported at the top of the notebook
# is no longer overwritten by a WordCloud instance.
cloud_davis = WordCloud(max_font_size=80, width=600, height=400, background_color='white').generate(all_sentences)

# plot the wordcloud
plt.figure(figsize=(12,10))
plt.imshow(cloud_davis, interpolation='bilinear')
plt.axis("off")
plt.title('WordCloud for Davis', fontsize=14, fontweight='bold')
# Save before show(): the figure must still be current when it is written
plt.savefig(path + "/figs/Wordcloud_Davis.png")
plt.show()

In [None]:
# Word frequencies for Davis: tokens without stop words -> counts ->
# two-column frame (Word, Frequency) ordered from most to least frequent
davis_tokens = df_fem.loc[df_fem['author'] == 'Davis', 'tokenized_txt']
processed_Dav = processTokens(davis_tokens)
Davis_wcounts = getWordCounts(processed_Dav)
Dav_df = Davis_wcounts.rename_axis('Word').reset_index(name='Frequency')
Dav_df.head(10)

In [None]:
# Horizontal bar plot of the ten most frequent words for Davis.
# A fresh figure/axes is created here; previously the labels were set on an
# `ax` left over from an earlier, unrelated figure.
fig, ax = plt.subplots()
sns.barplot(x='Frequency', y='Word', data=Dav_df[0:10], ax=ax)

ax.set_xlabel('Frequency', fontsize=12, labelpad=10, fontweight='bold')
ax.set_ylabel('Word', fontsize=12, labelpad=10, fontweight='bold')
ax.set_title('Top 10 Words for Davis', fontweight='bold')

fig.savefig(path + '/figs/Top10_Davis.jpg');

### WordCloud for Wollstonecraft

In [None]:
# Wordcloud for Wollstonecraft
# Join every Wollstonecraft sentence into one space-separated string
all_sentences = ' '.join(df_fem[df_fem['author'] == 'Wollstonecraft']['sentence_str'])

# create a wordcloud object. The result is bound to `cloud_woll` rather than
# `wordcloud`, so the `wordcloud` module imported at the top of the notebook
# is no longer overwritten by a WordCloud instance.
cloud_woll = WordCloud(max_font_size=80, width=600, height=400, background_color='white').generate(all_sentences)

# plot the wordcloud
plt.figure(figsize=(12,10))
plt.imshow(cloud_woll, interpolation='bilinear')
plt.axis("off")
plt.title('WordCloud for Wollstonecraft', fontsize=14, fontweight='bold')
# Save before show(): the figure must still be current when it is written
plt.savefig(path + "/figs/Wordcloud_Woll.png")
plt.show()

In [None]:
# Word frequencies for Wollstonecraft: tokens without stop words -> counts ->
# two-column frame (Word, Frequency) ordered from most to least frequent
woll_tokens = df_fem.loc[df_fem['author'] == 'Wollstonecraft', 'tokenized_txt']
processed_Woll = processTokens(woll_tokens)
Woll_wcounts = getWordCounts(processed_Woll)
woll_df = Woll_wcounts.rename_axis('Word').reset_index(name='Frequency')
woll_df.head(10)

In [None]:
# Horizontal bar plot of the ten most frequent words for Wollstonecraft.
# A fresh figure/axes is created here; previously the labels were set on an
# `ax` left over from an earlier, unrelated figure.
fig, ax = plt.subplots()
sns.barplot(x='Frequency', y='Word', data=woll_df[0:10], ax=ax)

ax.set_xlabel('Frequency', fontsize=12, labelpad=10, fontweight='bold')
ax.set_ylabel('Word', fontsize=12, labelpad=10, fontweight='bold')
ax.set_title('Top 10 Words for Wollstonecraft', fontweight='bold')

fig.savefig(path + '/figs/Top10_Woll.jpg');

##### Look at the top 10 word for three authors at the same time to see the similarity and difference

In [None]:
# Side-by-side comparison: top-10 word frequencies, one panel per author
fig, axes = plt.subplots(1,3, figsize=(15, 10))

# (panel title, word-frequency frame) in left-to-right panel order
author_panels = [
    ('Beauvoir', Bead_df),
    ('Davis', Dav_df),
    ('Wollstonecraft', woll_df),
]

for panel, (author_name, _) in zip(axes, author_panels):
    panel.set_title(author_name)

for panel, (_, word_freq) in zip(axes, author_panels):
    sns.barplot(ax=panel, x='Frequency', y='Word', data=word_freq[0:10])

plt.savefig(path + '/figs/Top10_3author.jpg');

##### Comparing the word frequencies across authors, all three commonly use gender-related words such as "woman" and "man" and their plural forms. Their topics differ, however: they focus on areas such as racial problems, gender relations in marriage, and the roles of wife and husband in society as a whole. This may be why they express sentiment and emotions differently, since some topics are more complicated to resolve and still have a long way to go.

# Future work

Some further analyses, such as topic modeling and TF-IDF, have not been included. Also, sentiment analysis is limited when it focuses only on single words. Clustering of emotions should be explored in the future.