# Step 3: Conducting Analysis

Now, we will perform the final step of this project: conducting our analysis!

In [None]:
# Mount Google Drive at /content/drive; the setup cell below copies
# kaggle.json from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

First, we have to download our datasets from Kaggle and convert them to `Dataset` objects.

In [None]:
! pip install kaggle
! mkdir ~/.kaggle
! cp /content/drive/MyDrive/kaggle.json ~/.kaggle/kaggle.json

! kaggle datasets download adhok93/inauguration-and-womensmarch-tweets
! unzip inauguration-and-womensmarch-tweets.zip

! kaggle datasets download prathamsharma123/farmers-protest-tweets-dataset-csv
! unzip farmers-protest-tweets-dataset-csv.zip

! pip install transformers datasets
! pip install keybert

In [None]:
import pandas as pd
from datasets import Dataset

# Load each CSV into a DataFrame and hand it straight to `Dataset.from_pandas`.
# The index is reset (and not preserved) so the resulting Dataset contains only
# the CSV columns, which is what the former dict round-trip achieved at a much
# higher memory cost.
womenmarch_df = pd.read_csv("womenmarch.csv", encoding='ISO-8859-1')
womenmarch = Dataset.from_pandas(womenmarch_df.reset_index(drop=True), preserve_index=False)

# The farmers' tweets are sorted chronologically because
# public_sentiment_time() locates its window by comparing row timestamps
# in order.
farmers_df = (
    pd.read_csv("tweets.csv", encoding='ISO-8859-1')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(by='date')
    .reset_index(drop=True)
)
farmers = Dataset.from_pandas(farmers_df, preserve_index=False)

We also have to load our fine-tuned model and a KeyBERT model.

In [None]:
from transformers import AutoModelForSequenceClassification
from keybert import KeyBERT

# Sequence-classification model loaded from the 'mayapapaya/Sentiment-Analyzer'
# checkpoint; public_sentiment() wraps it in a 'sentiment-analysis' pipeline.
sentiment_model = AutoModelForSequenceClassification.from_pretrained('mayapapaya/Sentiment-Analyzer')
# KeyBERT instance created with its default arguments; used by keyword_extractor().
keyword_model = KeyBERT()

Now, we can start conducting our analysis!


---

# Analysis

In this analysis, we define three functions, each with its own task: public sentiment, public sentiment within a certain time period, and keyword extraction.


### Function 1: Public Sentiment
This function is to determine the majority sentiment towards the movement.

*Parameters: dataset, tweet_index*

*Return Value: a list `[positive %, negative %, neutral %, total number of tweets]`*

In [None]:
from transformers import pipeline

def public_sentiment(dataset, tweet_index):
  """Classify every tweet in `dataset` and report the share of each sentiment.

  Parameters:
    dataset: sized iterable of rows; each row is indexed with `tweet_index`
      to obtain the tweet text.
    tweet_index: key (column name) of the tweet text inside a row.

  Returns:
    A list `[positive %, negative %, neutral %, total]`, where `total` is
    `len(dataset)`. An empty dataset yields `[0.0, 0.0, 0.0, 0]` instead of
    raising ZeroDivisionError.
  """
  total = len(dataset)
  if total == 0:
    # Nothing to classify; avoid building the pipeline and dividing by zero.
    return [0.0, 0.0, 0.0, 0]

  # Wrap the fine-tuned model in a sentiment-analysis pipeline
  generator = pipeline('sentiment-analysis',
                    model=sentiment_model,
                    tokenizer='bert-base-uncased')

  # The model's labels are counted as: LABEL_2 -> positive,
  # LABEL_1 -> neutral, LABEL_0 -> negative. Any other label is ignored.
  counts = {"LABEL_2": 0, "LABEL_1": 0, "LABEL_0": 0}

  # Determine each tweet text's sentiment (inputs are truncated to 512 tokens)
  for row in dataset:
    label = generator(row[tweet_index], max_length=512, truncation=True)[0]['label']
    if label in counts:
      counts[label] += 1

  # Return the percentages of each sentiment and the total number of tweets
  return [counts["LABEL_2"] / total * 100,
          counts["LABEL_0"] / total * 100,
          counts["LABEL_1"] / total * 100,
          total]

### Function 2: Public Sentiment within a Certain Time Period

This function is to determine whether a certain protest (for example, a riot) affects how the movement is viewed.

*Parameters: start_date, end_date (both are timestamp strings such as `"2021-01-26 00:00:00+00:00"`), dataset, tweet_index, time_index*

*Return Value: a list `[positive %, negative %, neutral %, number of tweets in the period]`, or `None` if the dates do not exist within the dataset*

In [None]:
from datetime import datetime
from datasets import Dataset

def public_sentiment_time(start_date, end_date, dataset, tweet_index, time_index):
  """Run `public_sentiment` on the tweets posted between two timestamps.

  Parameters:
    start_date, end_date: window bounds (both inclusive), given either as
      strings in the '%Y-%m-%d %H:%M:%S%z' format or as datetime objects.
    dataset: rows sorted in ascending order of `time_index`; must support
      `len()`, integer/slice indexing and column access by name.
    tweet_index: key of the tweet text inside a row.
    time_index: key of the timestamp inside a row.

  Returns:
    The list returned by `public_sentiment` for the selected rows, or None
    (after printing a message) when the dataset is empty, the window does not
    overlap the dataset's time span, or no tweet falls inside the window.
    A window that only partly overlaps the data is clamped to what exists.
  """
  from bisect import bisect_left, bisect_right

  date_format = '%Y-%m-%d %H:%M:%S%z'
  if isinstance(start_date, str):
    start_date = datetime.strptime(start_date, date_format)
  if isinstance(end_date, str):
    end_date = datetime.strptime(end_date, date_format)

  # Checking if dates exist within dataset: a window lying entirely before the
  # first tweet or entirely after the last one cannot select anything.
  if len(dataset) == 0:
    print("Date does not exist within dataset")
    return None
  first_time = dataset[0][time_index]
  last_time = dataset[-1][time_index]
  if start_date > last_time or end_date < first_time:
    print("Date does not exist within dataset")
    return None

  # Segment dataset into the specified time period. The timestamps are sorted,
  # so a binary search finds the first row >= start_date and the position just
  # past the last row <= end_date.
  # NOTE(review): assumes dataset[time_index] yields the whole timestamp
  # column, as a Hugging Face Dataset does - confirm for other containers.
  timestamps = list(dataset[time_index])
  start_date_index = bisect_left(timestamps, start_date)
  end_date_index = bisect_right(timestamps, end_date)

  if start_date_index >= end_date_index:
    # The window falls into a gap between tweets (or start_date > end_date).
    print("Date does not exist within dataset")
    return None

  seg_dataset = dataset[start_date_index:end_date_index]
  seg_dataset = Dataset.from_dict(seg_dataset)

  return public_sentiment(seg_dataset, tweet_index)

### Function 3: Keyword Extractor

This function is to determine whether the issue itself is being spread, or whether the protester’s actions distract from that.

*Parameters: dataset (the tweet texts to scan), keywords (a list that holds all the keyword variations considered central to the movement, e.g. abortion keywords --> “abortion”, “reproductive rights”, “pro-life”, “pro-choice”, etc.)*

*Return Value: a list `[percentage of keyword matches relative to the number of tweets, total number of tweets]`*

In [None]:
def keyword_extractor(dataset, keywords):
  """Measure how many tweets have an extracted keyword that is in `keywords`.

  Parameters:
    dataset: sized iterable of tweet texts.
    keywords: iterable of keyword strings to look for (compared
      case-insensitively against the extracted keywords).

  Returns:
    A list `[percentage, total]`: the percentage of tweets with at least one
    extracted keyword found in `keywords`, and `len(dataset)`. An empty
    dataset yields `[0.0, 0]` instead of raising ZeroDivisionError.
  """
  total = len(dataset)
  if total == 0:
    return [0.0, 0]

  # Set lookup; lower-cased so that "Riot" and "riot" are treated alike.
  wanted = {keyword.lower() for keyword in keywords}
  related = 0

  for tweet in dataset:
    # Input the tweet text into the model; each result item starts with the
    # extracted keyword/keyphrase (1- or 2-word n-grams).
    extracted = keyword_model.extract_keywords(tweet, keyphrase_ngram_range=(1, 2))

    # Count the tweet once, however many of its keywords match, so the
    # percentage can never exceed 100.
    if any(item[0].lower() in wanted for item in extracted):
      related += 1

  return [related / total * 100, total]

Now let's analyze our datasets using these functions!

## WomensMarch Analysis

We'll start with the #WomensMarch dataset:

In [None]:
# Print the list returned by public_sentiment() for the 'text' column of the
# womenmarch dataset. The header is printed first because the call is long-running.
print("Public Sentiment:")
print(public_sentiment(womenmarch, 'text'))



```
# Func 1 Output:

Positive Tweets: 37.00666666666667 %
Negative Tweets: 18.593333333333334 %
Neutral Tweets: 44.4 %

Total Tweets: 15,000
```



In [None]:
# Print the list returned by keyword_extractor() for the 'text' column of the
# womenmarch dataset, checked against four disruption-related keywords.
print("Keyword Usage: ")
print(keyword_extractor(womenmarch['text'], ["riot", "nuisance", "traffic", "problem"]))

```
# Func 3 Output:

Keywords: "riot", "nuisance", "traffic", "problem"
Percentage: 0.0 %
```



## The Indian Farmers' Protest Analysis

Now, we'll look at our second dataset:

In [None]:
# Print the list returned by public_sentiment() for the 'renderedContent'
# column of the farmers dataset.
print("Public Sentiment:")
print(public_sentiment(farmers, 'renderedContent'))



```
# Func 1 Output:

Positive Tweets: 24.312463806604626 %
Negative Tweets: 34.93026892845419 %
Neutral Tweets: 40.75726726494119 %

Total Tweets: 1,084,452
```



In [None]:
# Sentiment of the farmers' tweets posted within the window below.
start_date, end_date = "2020-11-29 00:00:00+00:00", "2021-03-22 00:00:00+00:00"
print(f"Public Sentiment between {start_date} and {end_date}: ")
print(
    public_sentiment_time(
        start_date,
        end_date,
        dataset=farmers,
        tweet_index='renderedContent',
        time_index='date',
    )
)



```
# Func 2 Output Between 11-29-2020 and 03-22-2021:

Positive Tweets: 25.4336761614079 %
Negative Tweets: 34.31111038955768 %
Neutral Tweets: 40.25521344903443 %

Total Tweets: 615,955
```



In [None]:
# Sentiment of the farmers' tweets posted within the window below.
start_date, end_date = "2021-11-05 00:00:00+00:00", "2021-11-21 00:00:00+00:00"
print(f"Public Sentiment between {start_date} and {end_date}: ")
print(
    public_sentiment_time(
        start_date,
        end_date,
        dataset=farmers,
        tweet_index='renderedContent',
        time_index='date',
    )
)



```
# Func 2 Output Between 11-05-2021 and 11-21-2021:

Positive Tweets: 27.842227378190255 %
Negative Tweets: 33.75259494443766 %
Neutral Tweets: 38.40517767737208 %

Total Tweets: 24,567
```



In [None]:
# Sentiment of the farmers' tweets posted within the window below.
start_date, end_date = "2021-01-26 00:00:00+00:00", "2021-02-01 00:00:00+00:00"
print(f"Public Sentiment between {start_date} and {end_date}: ")
print(
    public_sentiment_time(
        start_date,
        end_date,
        dataset=farmers,
        tweet_index='renderedContent',
        time_index='date',
    )
)



```
# Func 2 Output Between 01-26-2021 and 02-01-2021:

Positive Tweets: 13.413878562577446 %
Negative Tweets: 48.213548120611314 %
Neutral Tweets: 38.372573316811234 %

Total Tweets: 38,736

```



In [None]:
# Print the list returned by keyword_extractor() for the 'renderedContent'
# column of the farmers dataset, checked against four disruption-related keywords.
print("Keyword Usage: ")
print(keyword_extractor(farmers['renderedContent'], ["riot", "nuisance", "traffic", "problem"]))

```
# Func 3 Output:
```



And we have now finished conducting our analysis!