In this notebook you will find:
- Key words extraction
- Polarization Analysis
- Emotion Detection
- NER Analysis

## Libraries

In [None]:
! pip install seaborn
! pip install textblob
! pip install spacy

In [None]:
import pandas as pd
import nltk
import lzma
import matplotlib.pyplot as plt
from nltk import pos_tag, word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from collections import Counter
from wordcloud import WordCloud
from nltk.sentiment import SentimentIntensityAnalyzer
from nrclex import NRCLex
import plotly.express as px
import seaborn as sns
import re 
from textblob import TextBlob
import spacy

# Fetch the NLTK resources this notebook relies on (same order as before).
for resource in ('punkt', 'stopwords', 'vader_lexicon', 'averaged_perceptron_tagger'):
    nltk.download(resource)

Set `directory_path` below to the folder where the dataframes are stored.

In [4]:
# Base folder that holds the yearly .lzma dataframes.
# Keep the trailing slash: the cells below build each file path by appending
# the file name directly to this string.
directory_path = "C:/Users/Paola/OneDrive/Desktop/Data Science/2 ANNO/Social media analytics/Progetto/"

## Key Words

In [None]:
def extract_key_words(text):
    """Return the lower-cased tokens of ``text`` whose POS tag starts with 'N' or 'J'.

    Every character that is not a letter, digit or whitespace is removed
    before tokenizing, so punctuation never reaches the tagger.

    Args:
        text: Raw text to analyse.

    Returns:
        List of lower-cased tokens, in order of appearance (duplicates kept).
    """
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    wanted_prefixes = ('N', 'J')
    return [
        token.lower()
        for token, tag in pos_tag(word_tokenize(cleaned))
        if tag.startswith(wanted_prefixes)
    ]

# Build the stop-word set once; it is identical for every year.
stop_words = set(stopwords.words('english'))
# extract_key_words strips punctuation before tokenizing (e.g. "don't" -> "dont"),
# so also register the punctuation-free spelling of every stop word.
stop_words |= {re.sub(r'[^a-zA-Z0-9\s]', '', word) for word in stop_words}

for year in range(2007, 2023):
    file_path = f"{directory_path}politics_comments_{year}.lzma"

    # Loading is handled separately so that only genuine loading problems are
    # reported as "Error loading".
    try:
        # NOTE(review): unpickling can execute arbitrary code; only load files you trust.
        with lzma.open(file_path, 'rb') as file:
            df = pd.read_pickle(file)
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        continue

    try:
        ## change number of rows
        selected_rows = df.nlargest(10000, 'score')

        # Concatenate all comments into a single string (missing bodies are skipped)
        comments_text = ' '.join(selected_rows['body'].dropna())

        # POS-tag the full text first (the tagger relies on sentence context),
        # then remove stop words from the extracted nouns/adjectives.
        key_words = [word for word in extract_key_words(comments_text) if word not in stop_words]

        if not key_words:
            # WordCloud.generate raises on empty input; nothing to show for this year.
            print(f"No key words found for {year}")
            continue

        # Generate WordCloud
        wordcloud = WordCloud(width=800, height=400, max_words=200, background_color='white').generate(' '.join(key_words))
        plt.figure(figsize=(10, 6))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.title(f"Word Cloud for {year}")
        plt.axis('off')
        plt.show()

        # Top 20 key words
        key_word_counts = Counter(key_words)
        top_key_words = key_word_counts.most_common(20)
        print(f"Top 20 Key Words for {year}:")
        for word, count in top_key_words:
            print(f"{word}: {count}")

    except Exception as e:
        print(f"Error processing {file_path}: {e}")


## Polarization

In [None]:
def get_political_sentiment(comment):
    """Return the TextBlob sentiment polarity of ``comment``."""
    return TextBlob(comment).sentiment.polarity

# The stop-word list is the same for every year, so build it once.
stop_words = set(stopwords.words('english'))

for year in range(2007, 2023):
    try:
        file_path = f"{directory_path}politics_comments_{year}.lzma"

        # NOTE(review): unpickling can execute arbitrary code; only load files you trust.
        with lzma.open(file_path, 'rb') as file:
            df = pd.read_pickle(file)

        ## change number of rows
        # Drop comments without a body (as the key-word cell does): NaN has no
        # .split(), so a single missing body would abort the whole year.
        # .copy() makes the column assignments below act on an independent frame.
        top_comments = df.nlargest(10000, 'score').dropna(subset=['body']).copy()

        # Lower-case every comment and remove stop words before scoring.
        top_comments['body'] = top_comments['body'].apply(
            lambda x: ' '.join(word.lower() for word in x.split() if word.lower() not in stop_words)
        )

        top_comments['political_sentiment'] = top_comments['body'].apply(get_political_sentiment)

        # Histogram of the polarity scores
        plt.figure(figsize=(10, 5))
        sns.histplot(top_comments['political_sentiment'], bins=20, kde=False, color='skyblue', edgecolor='black')
        plt.title(f'Barplot of Political Sentiment for {year}')
        plt.xlabel('Political Sentiment Score')
        plt.ylabel('Count')
        plt.show()

    except Exception as e:
        print(f"Errore durante l'analisi dell'anno {year}: {e}")

In [None]:
for year in range(2007, 2023):
    try:
        file_path = f"{directory_path}connections_{year}.lzma"

        with lzma.open(file_path, 'rb') as handle:
            df = pd.read_pickle(handle)

        # Histogram of the 'sentiment' column stored in the connections dataframe
        plt.figure(figsize=(10, 5))
        axes = sns.histplot(
            df['sentiment'],
            bins=20,
            kde=False,
            color='skyblue',
            edgecolor='black',
        )
        axes.set_title(f'Barplot of Sentiment for {year}')
        axes.set_xlabel('Sentiment Score')
        axes.set_ylabel('Count')
        plt.show()

    except Exception as err:
        print(f"Errore durante l'analisi dell'anno {year}: {err}")

## Emotion Detection

In [None]:
def get_nrclex(df):
  """Build a single NRCLex object from all comment bodies in ``df``.

  Args:
      df: DataFrame with a 'body' column holding the comment texts.

  Returns:
      NRCLex object built from the bodies joined with single spaces.
  """
  # Missing bodies (NaN) are floats and would make str.join raise TypeError,
  # so drop them before joining; astype(str) leaves string bodies unchanged.
  bodies = df['body'].dropna().astype(str)
  return NRCLex(' '.join(bodies))

def get_emofreq(text_object):
  """Print the affect frequencies of ``text_object`` followed by its top emotions.

  Args:
      text_object: Object exposing ``affect_frequencies`` and ``top_emotions``.
  """
  frequencies = text_object.affect_frequencies
  print(frequencies)
  # sep='\n' puts the heading and the value on separate lines.
  print('\nThe top emotion is:', text_object.top_emotions, sep='\n')

def get_sentimentscores(text_object):
  """Print the raw emotion scores of ``text_object`` as a table and show them as a pie chart.

  Args:
      text_object: Object exposing ``raw_emotion_scores`` as a mapping of
          emotion name to count.
  """
  emotion_pairs = list(text_object.raw_emotion_scores.items())
  # Positional columns 0/1 become the labelled columns used by the chart.
  sentiment_scores = pd.DataFrame(emotion_pairs).rename(columns={0: "Sentiment", 1: "Count"})
  print(sentiment_scores)
  print('\n')

  pie = px.pie(
      sentiment_scores,
      values='Count',
      names='Sentiment',
      title='Sentiment Scores',
      hover_data=['Sentiment'],
  )
  pie.update_traces(textposition='inside', textinfo='percent+label')
  pie.show()

for year in range(2007, 2023):
    try:
        file_path = f"{directory_path}politics_comments_{year}.lzma"

        with lzma.open(file_path, 'rb') as handle:
            df = pd.read_pickle(handle)

        ## change number of rows
        top_comments = df.nlargest(10000, 'score')

        # Show a preview of the selected comments before the emotion analysis.
        print(f"Dataframe top_comments per l'anno {year}:", top_comments.head(), sep='\n')

        # One NRCLex object per year feeds both reports.
        text_object = get_nrclex(top_comments)
        get_emofreq(text_object)
        get_sentimentscores(text_object)

    except Exception as err:
        print(f"Errore durante l'analisi dell'anno {year}: {err}")

## NER

If the model is not installed yet, run this in a command prompt first: `python -m spacy download en_core_web_sm`

In [None]:
# Load the spaCy pipeline once at module level; perform_ner below reuses it
# for every comment. Requires the en_core_web_sm model to be installed
# (see the note above this cell).
nlp = spacy.load("en_core_web_sm")

def perform_ner(text):
    """Return the PERSON and ORG entities found in ``text``.

    Args:
        text: Text to run through the module-level ``nlp`` pipeline.

    Returns:
        List of ``(entity_text, entity_label)`` tuples in document order.
    """
    wanted_labels = ['PERSON', 'ORG']
    entities = []
    for span in nlp(text).ents:
        if span.label_ in wanted_labels:
            entities.append((span.text, span.label_))
    return entities

for year in range(2007, 2010):
    try:
        file_path = f"{directory_path}politics_comments_{year}.lzma"

        # NOTE(review): unpickling can execute arbitrary code; only load files you trust.
        with lzma.open(file_path, 'rb') as file:
            df = pd.read_pickle(file)

        ## change number of rows
        # Single source of truth for the sample size, so the message printed
        # below always matches the number of comments actually analysed.
        n_comments = 10000
        # Drop comments without a body (NaN cannot be passed to the spaCy
        # pipeline); .copy() makes the new column land on an independent frame.
        top_comments = df.nlargest(n_comments, 'score').dropna(subset=['body']).copy()

        top_comments['ner_results'] = top_comments['body'].apply(perform_ner)

        # Flatten the per-comment (text, label) lists into a list of entity texts.
        all_entities = [entity for entities_list in top_comments['ner_results'] for entity, _label in entities_list]

        if not all_entities:
            # WordCloud.generate raises on empty input; nothing to plot for this year.
            print(f"No PERSON/ORG entities found for {year}")
            continue

        entity_counts = Counter(all_entities)
        top_n = 10
        top_entities = entity_counts.most_common(top_n)

        print(f"Named Entity Recognition Results for the top {n_comments} comments in {year}:")
        print(top_comments[['body', 'ner_results']].head(10))
        print("\n")

        plt.figure(figsize=(10, 6))
        plt.bar([entity[0] for entity in top_entities], [entity[1] for entity in top_entities])
        plt.xlabel('Entity')
        plt.ylabel('Count')
        plt.title(f'Top {top_n} Named Entities for {year}')
        plt.xticks(rotation=45, ha='right')
        plt.show()

        # NOTE(review): joining with spaces lets WordCloud re-tokenize multi-word
        # entities into separate words; confirm that this is the intended view.
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(all_entities))

        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(f'Word Cloud for Named Entities (PERSON and ORG) - {year}')
        plt.show()

    except Exception as e:
        print(f"Errore durante l'analisi NER dell'anno {year}: {e}")