[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/12s3800zmkJdQQdPqHDgym8BT8hle9Dtz)

# Install required packages

In [None]:
!pip install kaggle 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting kaggle
  Downloading kaggle-1.5.12.tar.gz (58 kB)
[K     |████████████████████████████████| 58 kB 2.6 MB/s 
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
  Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73051 sha256=cfad9b8294e78a6dd3489299067d345278d326b23afd39a54ace003a6f2873f6
  Stored in directory: /root/.cache/pip/wheels/62/d6/58/5853130f941e75b2177d281eb7e44b4a98ed46dd155f556dc5
Successfully built kaggle
Installing collected packages: kaggle
Successfully installed kaggle-1.5.12


In [7]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 42 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 51.6 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=c0ad5649e8df2f84a09c0acef3e548da6a7c361e912c52717725e7028474f2eb
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


# Data import

In [1]:
import pandas as pd
import numpy as np
import csv
import os
from zipfile import ZipFile
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations

#For string manipulation
import string
import re
punct = set(string.punctuation) 

#StopWords
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

#Lemmatizer 
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()

#Tokenizer
#from nltk.tokenize import word_tokenize
#nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [2]:
os.environ["KAGGLE_USERNAME"] = "marcocazzola"
os.environ["KAGGLE_KEY"] = "2c598ba6fcb8fe02fc9d42b1dd44224b"

#4th of April -> Bucha massacre discover
!kaggle datasets download bwandowando/ukraine-russian-crisis-twitter-dataset-1-2-m-rows -f UkraineWar/UkraineWar/0404_UkraineCombinedTweetsDeduped.csv.gzip 

Downloading 0404_UkraineCombinedTweetsDeduped.csv.gzip.zip to /content
 98% 92.0M/94.2M [00:02<00:00, 45.6MB/s]
100% 94.2M/94.2M [00:02<00:00, 48.6MB/s]


In [3]:
#Read the dataframe and consider only english tweets
with ZipFile('/content/0404_UkraineCombinedTweetsDeduped.csv.gzip.zip', 'r') as zip_object:
    zip_object.extractall()

df = pd.read_csv("/content/0404_UkraineCombinedTweetsDeduped.csv.gzip", 
                 compression='gzip', index_col=0, encoding='utf-8', 
                 quoting=csv.QUOTE_ALL)
df = df[df.language == 'en']

#A sample of 5000 unique english tweets
tweets = pd.Series(df.text.unique()).sample(n = 5000, random_state=42)
tweets

  exec(code_obj, self.user_global_ns, self.user_ns)


60582    Nah 🙄\n\nRussia is trying HARD to spread FAKES...
12890    My hope is for peace amongst all people and a ...
64511    @HillReporter @MeidasTouch #BREAKING: Judge Ke...
4097     There is a moment when cowardly is an excuse f...
40919    ‼️ Ombudsman Lyudmila Denisova called the pris...
                               ...                        
5718     It’s rare that any EU country would ask anythi...
20322    Epoch of the boneless politics… @UN @UNHumanRi...
28375                   #Bucha don’t forget, don’t forgive
8171     Family says Russians kidnapped this Ukrainian ...
52865    #Zelenskyy  describing the #AzovBattalion #AZO...
Length: 5000, dtype: object

# Preprocessing

In [4]:
#Function to be applied to preprocess each tweet for cleaning purposes

def cleaning_txt(tweet): 

  #1. Lowercase everything
  res = tweet.lower()

  #2. Remove URLs
  res = re.sub("https?:\/\/.*[\r\n]*", "", res)\

  #3. Remove punctuation 
  res = "".join([ch for ch in res if ch not in punct])

  #4. After point 3, "&amp" (standing for "&") has become "amp", so let us substitute any " amp " with " and "
  #   Moreover, let us also remove any "strange" utf code and emoticons
  res = re.sub(" amp ", " and ", res).encode("ascii", "ignore").decode().replace("\n", " ")

  #5. Tokenize
  res = res.split()

  #6. Remove stopwords
  res = [w for w in res if w not in stop_words]

  #7. Lemmatization
  res = [lemmatizer.lemmatize(w) for w in res]

  #8. Just unique words inside a sentence
  res = list(set(res))

  return(res)

In [8]:
#creating the spark context
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
sc = spark.sparkContext

In [9]:
#parallelizing original data
rdd = sc.parallelize(tweets)

In [10]:
#clean the parallelized data
rdd = rdd.map(cleaning_txt)

In [11]:
#Nr of baskets
n = len(tweets)

#List of unique words
unique_ws = rdd.flatMap(lambda txt: txt)\
.map(lambda w: (w, 1))\
.reduceByKey(lambda w1, w2: 1)\
.map(lambda x: x[0])\
.collect()
  
#Dict of the type(word : int)
unique_wsk = dict((w, i) for i,w in enumerate(unique_ws, 1)) 
#Dict of the type(int : word)
unique_wsv = dict((i, w) for i,w in enumerate(unique_ws, 1)) 

#Algorithm implementation

The goal of the project is to find *frequent* itemsets. 

By frequent we mean itemsets with a frequency higher than a given threshold. 

In our case, we will consider as frequent any item or itemset appearing at least in 2% of the tweets. 

In [12]:
def toWords(x, w_dict): 
  if type(x) == int: 
    return( w_dict[x] )
  else: 
    return( (tuple([w_dict[i] for i in x])) )

In [13]:
def candidateFrequentSets(sent, freq_only): 

  if type(freq_only[0]) == int: 
    cand_set = [tuple(sorted(t)) for t in list(combinations(sent, 2))]
    res = []

    for cand in cand_set: 
      filtr = [w for w in cand]
      if all( [sub in freq_only for sub in filtr] ):
        res.append((cand, 1))
      else: 
        continue

  else: 
    km1 = len(freq_only[0])
    cand_set = [tuple(sorted(t)) for t in list(combinations(sent, km1 + 1))]
    res = []

    for cand in cand_set: 
      filtr = [tuple(sorted(t)) for t in list(combinations(cand, km1))]
      if all( [sub in freq_only for sub in filtr] ): 
        res.append((cand, 1))
      else: 
        continue

  return(res)

In [14]:
def apriori(rdd, i_dict, n, k=2, s=0.02): 

  counts = [None] * k
  freq_only = [None] * k

  #tweets to integers
  sent_int = rdd.map(lambda x: [i_dict[w] for w in x])

  counts[0] = sent_int.flatMap(lambda sent: sent)\
  .map(lambda w: (w, 1))\
  .reduceByKey(lambda w1, w2: w1+w2)\
  .filter(lambda x: x[1] > s*n)

  freq_only[0] = counts[0]\
  .map(lambda x: x[0])\
  .collect()

  if k == 1: 
    return (counts) 

  else: 
    curr_k = 2
    
    while curr_k <= k:
      counts[curr_k - 1] = sent_int.filter(lambda x: len(x) >= curr_k)\
      .flatMap(lambda x: candidateFrequentSets(x, freq_only[curr_k - 2]))\
      .reduceByKey(lambda t1, t2: t1+t2)\
      .filter(lambda x: x[1] > s*n)
      
      freq_only[curr_k - 1] = counts[curr_k - 1]\
      .map(lambda x: x[0])\
      .collect()

      if len(freq_only[curr_k - 1]) == 0: 
        return("No frequent itemsets of such size.")

      elif curr_k == k:
        return (counts) 

      else:
        curr_k += 1 
  

In [15]:
#Generate the count of frequent itemsets
counts = apriori(rdd, unique_wsk, n, k=3)

In [16]:
def getWordsAndFreq(counts, w_dict, k):

  res = counts[k-1].map(lambda x: (toWords(x[0], w_dict), x[1]))\
  .sortBy(lambda x: -x[1])\
  .collect()

  return(dict(res))

In [17]:
getWordsAndFreq(counts, unique_wsv, k=3)

{('russia', 'ukraine', 'russian'): 197,
 ('putin', 'russia', 'ukraine'): 197,
 ('russia', 'ukraine', 'war'): 190,
 ('putin', 'ukraine', 'war'): 148,
 ('ukraine', 'russian', 'war'): 128,
 ('ukraine', 'crime', 'war'): 120,
 ('bucha', 'russia', 'ukraine'): 120,
 ('bucha', 'ukraine', 'russian'): 103,
 ('putin', 'ukraine', 'russian'): 103,
 ('ukrainian', 'ukraine', 'russian'): 101}

# Confidence and interest

In [18]:
def confAndIntr(counts, n, w_dict, k):

  big_sets = counts[k-1].map(lambda x: x[0]).collect()

  conf = {}
  intr = {}

  for big_set in big_sets: 
    if k == 2:
      Is = [i for i in big_set]
    else:
      Is = [tuple(sorted(t)) for t in list(combinations(big_set, k-1))]

    js = [int(np.setdiff1d(big_set, t)[0]) for t in Is]

    freq_big_set = counts[k-1].filter(lambda x: x[0] == big_set)\
    .map(lambda x: x[1])\
    .collect()[0]

    for idx in range(len(Is)): 
      I = Is[idx]
      j = js[idx]

      freq_I = counts[k-2].filter(lambda x: x[0] == I)\
      .map(lambda x: x[1])\
      .collect()[0]

      freq_j = counts[0].filter(lambda x: x[0] == j)\
      .map(lambda x: x[1])\
      .collect()[0]

      conf[(toWords(I, w_dict), toWords(j, w_dict))] = freq_big_set / freq_I
      intr[(toWords(I, w_dict), toWords(j, w_dict))] = (freq_big_set / freq_I) - \
                                                       (freq_j / n)

  #Sorting the dicts
  conf = dict(sorted(conf.items(), key= lambda x: -x[1]))
  intr = dict(sorted(intr.items(), key= lambda x: -x[1]))

  return(conf, intr)

In [22]:
conf, intr = confAndIntr(counts, n, unique_wsv, k=3)

In [23]:
conf_s = pd.Series(conf).reset_index().rename(columns={'level_0' : 'I', 'level_1' : 'j', 0 : 'Confidence'})
intr_s = pd.Series(intr).reset_index().rename(columns={'level_0' : 'I', 'level_1' : 'j', 0 : 'Interest'})

In [24]:
pd.merge(conf_s, intr_s, how='outer', on=['I', 'j']).sort_values(by=['Interest', 'Confidence'], ascending=False)

Unnamed: 0,I,j,Confidence,Interest
0,"(ukraine, crime)",war,0.821918,0.665118
1,"(russia, war)",ukraine,0.745098,0.242098
9,"(putin, ukraine)",russia,0.461358,0.196558
15,"(putin, ukraine)",war,0.346604,0.189804
2,"(russian, war)",ukraine,0.684492,0.181492
23,"(ukraine, war)",crime,0.233463,0.179663
3,"(russia, russian)",ukraine,0.6633,0.1603
4,"(bucha, russia)",ukraine,0.655738,0.152738
13,"(ukrainian, ukraine)",russian,0.367273,0.145473
5,"(putin, russia)",ukraine,0.623418,0.120418
