# Iranian State-Sponsored Disinformation Campaigns on Twitter 
### Data Analysis

In [2]:
import pandas as pd
import numpy as np
import os
import warnings

## Step 1: Data cleaning

#### a). Load the raw Twitter data

In [2]:
# Silence every warning for the rest of the session (applies to all later cells)
warnings.filterwarnings('ignore')
# Load each CSV file in the data folder and combine them into a single data frame
def load_twitter_data():
    """
    Load the raw Twitter CSV exports from the ``src_data`` folder.

    Every ``*.csv`` file in ``<current working directory>/src_data`` is read
    with ``pd.read_csv`` and the frames are concatenated (outer join on
    columns). Exact duplicate rows are dropped and the index is renumbered
    from 0.

    Returns:
        pandas.DataFrame: the combined, de-duplicated data.

    Raises:
        ValueError: if the folder contains no ``.csv`` files.
    """
    data_dir = os.path.join(os.getcwd(), "src_data")

    # Sort the file names so row order (and which copy of a duplicate row
    # survives) does not depend on the operating system's listing order.
    files = sorted(f for f in os.listdir(data_dir) if f.endswith(".csv"))
    if not files:
        raise ValueError("No .csv files found in " + data_dir)

    # One data frame per file
    frames = [pd.read_csv(os.path.join(data_dir, filename)) for filename in files]

    # drop=True discards the old index outright; turning it into an 'index'
    # column first fails when the data already contains a column of that name.
    df_full = pd.concat(frames, join='outer').drop_duplicates().reset_index(drop=True)

    return df_full

# Combined raw tweets from every CSV file in src_data
tweets = load_twitter_data()

#### b). Select only the variables we are interested in

In [3]:
# Keep only the account, language, content and metadata columns used below
tweets_clean = tweets[[
    'user_screen_name',
    'user_display_name',
    'user_reported_location',
    'account_language',
    'tweet_language',
    'tweet_text',
    'tweet_time',
    'urls',
    'hashtags',
    'is_retweet',
]]

# Preview the first ten rows
tweets_clean.head(10)

Unnamed: 0,user_screen_name,user_display_name,user_reported_location,account_language,tweet_language,tweet_text,tweet_time,urls,hashtags,is_retweet
0,akhonfellah,⁦🇮🇷⁩أخٌ‌في‌الله,Iran,en,en,one person followed me // automatically checke...,2017-01-11 05:23,['http://fllwrs.com'],[],False
1,akhonfellah,⁦🇮🇷⁩أخٌ‌في‌الله,Iran,en,fa,#IDFTerrorists\nحماسه تروریستهای اسرائیلی http...,2018-05-26 00:48,[],['IDFTerrorists'],False
2,akhonfellah,⁦🇮🇷⁩أخٌ‌في‌الله,Iran,en,en,Stop war on Yemen hospitals\n#ShameOnUN\n#Yemen,2018-06-16 20:06,[],"['ShameOnUN', 'Yemen']",False
3,akhonfellah,⁦🇮🇷⁩أخٌ‌في‌الله,Iran,en,fa,لبیک یا فقیه\n#مجزرة_الدراز https://t.co/nKfQW...,2018-05-23 18:22,[],['مجزرة_الدراز'],False
4,akhonfellah,⁦🇮🇷⁩أخٌ‌في‌الله,Iran,en,fa,اینجا تل ابیب است\nاینها اسراییلیهایی هستند که...,2019-01-28 16:56,[],['زندگی_سگی_اسرائیلیها'],False
5,akhonfellah,⁦🇮🇷⁩أخٌ‌في‌الله,Iran,en,ar,وامروز هم ....\n#زندگی_سگی_اسرائیلیها https://...,2018-09-07 10:42,[],['زندگی_سگی_اسرائیلیها'],False
6,akhonfellah,⁦🇮🇷⁩أخٌ‌في‌الله,Iran,en,fa,حکمت ثابت موندن اسم ماههای قمری بعد از تغییر ز...,2017-11-19 19:40,[],[],False
7,akhonfellah,⁦🇮🇷⁩أخٌ‌في‌الله,Iran,en,fa,جمله ای که سیدحسن امشب گفت در مورد انقلاب اسلا...,2019-02-06 18:26,[],['إن_مع_الصبر_نصرا'],False
8,akhonfellah,⁦🇮🇷⁩أخٌ‌في‌الله,Iran,en,fa,بشنوید مدح حاج محمودآقوی کریمی رو با لهجه شیرا...,2017-08-04 12:25,['https://twitter.com/khanisadiq/status/893438...,[],False
9,akhonfellah,⁦🇮🇷⁩أخٌ‌في‌الله,Iran,en,ar,RT @awadazeinab1: كم ساعة مع ابني بالمستشفى شف...,2019-01-25 17:48,[],[],True


#### c). Filter tweets and keep those that meet at least one of the following:
- Account location is Venezuela
- Account language is Spanish
- Tweet language is Spanish

In [4]:
# Keep a tweet when ANY of the three criteria holds (the conditions are OR-ed)
tweets_clean = tweets_clean[
    (tweets_clean.user_reported_location == 'Venezuela')
    | (tweets_clean.account_language == 'es')
    | (tweets_clean.tweet_language == 'es')
].reset_index(drop=True)  # renumber rows from 0 without materializing the old index

#### d.) Take out tweets whose reported location is in Europe, the US or Canada

In [5]:
# Reported locations to exclude. Values are matched exactly, so the spellings
# below (including 'Califor' and 'Arizona,phoenix') are kept as they are.
excluded_locations = [
    'London',
    'Manhattan, NY',
    'Brooklyn, NY',
    'Queens, NY',
    'New York, NY',
    'California, USA',
    'New Jersey, USA',
    'North Holland, The Netherlands',
    'Atlantic City, NJ',
    'Mountain View, CA',
    'New York, USA',
    'Canada',
    'San Francisco, CA',
    'Washington, USA',
    'Washington, DC',
    'España',
    'Germany',
    'Nantes, France',
    'Houston, TX',
    'Texas,San Antonio',
    'Chicago',
    'Atlanta',
    'Washington,Seattle',
    'Fremont, CA',
    'France',
    'England, United Kingdom',
    'Oregon,Portland',
    'USA',
    'Florida,Orlando',
    'Califor',
    'California,Los Angeles',
    'Illinois, USA',
    'Arizona,phoenix',
    'Pennsylvania,Pittsburgh',
    'Pennsylvania,Philadelphia',
    'Dallas, TX',
]

# Drop rows whose location is in the list. Rows with a missing location are
# kept, exactly as with a chain of `!=` comparisons.
tweets_clean = tweets_clean[~tweets_clean.user_reported_location.isin(excluded_locations)]

#### e.) Quality control for Tweet Language

- Use SpaCy language identification to cross-check Twitter’s language identification. Identify language of each tweet using SpaCy.
- Exclude tweets on which SpaCy and Twitter don't agree on the language
- Spanish and English datasets consist of Tweets that were marked as either Spanish by BOTH Twitter and SpaCy or English by BOTH Twitter and SpaCy.

In [6]:
import spacy
from spacy_langdetect import LanguageDetector


# Initialize spacy with the SPANISH model
sp = spacy.load('es_core_news_sm')
# Append the language detector as the last component of the Spanish pipeline
sp.add_pipe(LanguageDetector(), name = 'language_detector', last = True)
# English model; no language detector is added to this pipeline
eng = spacy.load('en_core_web_sm')

In [None]:
def detect_language(df, content_column):
    '''
    Tag every row of a data frame with the language detected in its text.

    Each value of `content_column` is run through the module-level `sp`
    pipeline, and the 'language' entry of the resulting `doc._.language`
    is recorded for that row.

    Argument: a dataframe and the name of the column holding the text
    Output: the same dataframe (modified in place) with a new
            'spacy_language_detection' column
    '''

    # One detected language per row, in row order
    detected = [sp(text)._.language['language'] for text in df[content_column]]

    # Attach the detections to the data frame as a new column
    df['spacy_language_detection'] = detected

    return df

tweets_clean = detect_language(tweets_clean, 'tweet_text')

# Spanish subset: Twitter AND SpaCy both label the tweet 'es'
spanish_tweets = tweets_clean[
    tweets_clean.tweet_language.eq('es') & tweets_clean.spacy_language_detection.eq('es')
]


# English subset: Twitter AND SpaCy both label the tweet 'en'
english_tweets = tweets_clean[
    tweets_clean.tweet_language.eq('en') & tweets_clean.spacy_language_detection.eq('en')
]

In [8]:
# Reload the Spanish and English subsets from CSV files in proj_data, replacing
# the in-memory frames of the same name.
# NOTE(review): this expects the files to already contain a
# 'lemmatized_tweet_text' column; no cell visible in this notebook writes
# them - confirm where they are produced.
spanish_tweets = pd.read_csv('proj_data/spanish_tweets.csv', encoding='utf-8-sig', index_col = 0).reset_index()
english_tweets = pd.read_csv('proj_data/english_tweets.csv', encoding='utf-8-sig', index_col = 0).reset_index()
# Force the lemmatized text to str so every value is a string (missing values
# become the text 'nan')
english_tweets.lemmatized_tweet_text = english_tweets.lemmatized_tweet_text.astype('str')
spanish_tweets.lemmatized_tweet_text = spanish_tweets.lemmatized_tweet_text.astype('str')

# 2. Keyword analysis

### a.) Lemmatize and simplify Tweet text
- “Clean” each Tweet by removing stop words (very common words that carry little meaning) and non-alphabetic tokens, and by reducing each word to its lemma or root.

In [7]:
def clean_string(text_string, language):
    '''
    Lemmatize a string with SpaCy, dropping stop words and non-alphabetic tokens.

    Argument: a text string and a language ('English' or 'Spanish')
    Output: a cleaned string - the lemmas of the kept tokens joined by single spaces
    Raises: ValueError if language is neither 'Spanish' nor 'English'

    '''
    # Parse the text with the pipeline that matches the requested language
    if language == 'Spanish':
        doc = sp(text_string)
    elif language == 'English':
        doc = eng(text_string)
    else:
        # Fail fast with a clear message instead of reaching the loop below
        # with `doc` unbound.
        raise ValueError("language must be 'Spanish' or 'English', got " + repr(language))

    # Keep the lemma of every token that is alphabetic and is not a stop word
    lemmas = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]

    # Join the kept lemmas into a single space-separated string
    return " ".join(lemmas)

### An example of how the cleaning works:

In [12]:
# Use the tweet at index label 2 as a worked example
example_sp = spanish_tweets.loc[2,'tweet_text']
example_sp_clean = clean_string(example_sp, "Spanish")
# Print the raw text followed by its cleaned (lemmatized) form
print("Raw example: \n" + example_sp)
print("\n\nClean exmaple: \n" + example_sp_clean)

Raw example: 
RT @Algbarow55: Solo en #Yemen los padres pierden a sus hijos; Ni√±os pierden a sus padres #SaudiBombsChildren https://t.co/uImWl8QG5u


Clean exmaple: 
RT Solo Yemen padre perder a hijo perder a padre SaudiBombsChildren


In [None]:
def clean_content(df, content_column, language):
    '''
    Lemmatize every text in a corpus and store the result next to it.

    Argument: a dataframe, the name of the column with the content, and a language ('Spanish' or 'English')
    Output: the same dataframe (modified in place) with a new 'lemmatized_tweet_text' column
    '''

    # Run clean_string() over each text in the column, preserving row order
    lemmatized = [clean_string(text, language) for text in df[content_column]]

    # Store the cleaned strings as a new column
    df['lemmatized_tweet_text'] = lemmatized

    return df

# Add the 'lemmatized_tweet_text' column to the Spanish subset (Spanish pipeline)
spanish_tweets = clean_content(spanish_tweets, 'tweet_text', 'Spanish')

# Add the 'lemmatized_tweet_text' column to the English subset (English pipeline)
english_tweets = clean_content(english_tweets, 'tweet_text', 'English')

### b.) Create word counts

Use Scikit-Learn's CountVectorizer to create a matrix where each column is a word found in the Tweets, each row is a Tweet, and each cell is the number of times that word occurs in that Tweet.

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
# only words that appear in more than 5 tweets (a way to decrease the size of the vocab)
# min_df=5: keep only words that appear in at least 5 tweets (a way to decrease the size of the vocab)
word_vectorizer = CountVectorizer(encoding='utf-8-sig', analyzer='word', min_df=5, ngram_range=(1,1))
# create a sparse matrix where each row is a tweet, each column is a word, and each cell is a count
word_count_sm = word_vectorizer.fit_transform(spanish_tweets['lemmatized_tweet_text'])
# get_feature_names() is absent from newer scikit-learn releases; use
# get_feature_names_out() when it exists and fall back to the old name otherwise.
words = list(
    word_vectorizer.get_feature_names_out()
    if hasattr(word_vectorizer, 'get_feature_names_out')
    else word_vectorizer.get_feature_names()
)

# sum the count of each word over all Tweets (one row, one column per word)
word_count_total = word_count_sm.sum(axis=0)
word_count_total_df = pd.DataFrame(word_count_total, columns = words)

In [11]:
# Display the single-row frame of total counts per word
word_count_total_df

Unnamed: 0,abadi,abajar,abajo,abandonar,abatir,abbas,abby,abc,abdel,abdullah,...,äúpor,äúsegundo,äúsionistas,äútsunami,äúun,äúyo,òë,òï,ùå,üí
0,18,19,8,79,50,15,6,5,17,10,...,5,6,6,18,15,11,6,13,26,14


### c.) Phrase counts

Change the number of words analyzed with the same type of vectorizer

In [15]:
# only phrases that appear in more than 5 tweets (a way to decrease the size of the vocab)
# ngram_range=(5,7): phrases of 5 to 7 consecutive words
# min_df=5: keep only phrases that appear in at least 5 tweets (a way to decrease the size of the vocab)
phrase_vectorizer = CountVectorizer(encoding='utf-8-sig', analyzer='word', min_df=5, ngram_range=(5,7))
# create a sparse matrix where each row is a tweet, each column is a phrase, and each cell is a count
phrase_count_sm = phrase_vectorizer.fit_transform(spanish_tweets['lemmatized_tweet_text'])
# get_feature_names() is absent from newer scikit-learn releases; use
# get_feature_names_out() when it exists and fall back to the old name otherwise.
phrases = list(
    phrase_vectorizer.get_feature_names_out()
    if hasattr(phrase_vectorizer, 'get_feature_names_out')
    else phrase_vectorizer.get_feature_names()
)
# sum the count of each phrase over all Tweets (one row, one column per phrase)
phrase_count_total = phrase_count_sm.sum(axis=0)
phrase_count_total_df = pd.DataFrame(phrase_count_total, columns = phrases)

In [17]:
# Display the single-row frame of total counts per phrase
phrase_count_total_df

Unnamed: 0,abby martin exponer fascista colonial,abby martin exponer fascista colonial israel,abby martin exponer fascista colonial israel sionismo,abdel rahman hussein abu hmash,abdel rahman hussein abu hmash palestino,abdel rahman hussein abu hmash palestino asesinar,abogado hermann el pdte llamar,abogado hermann el pdte llamar reflexionar,abogado hermann el pdte llamar reflexionar pensar,abogados casar narvarte objetivo malware,...,äúes fuerza seguridad estado entrar,äúes fuerza seguridad estado entrar perseguir,äúgobernadores adversar gobierno renunciar resignarse,äúgobernadores adversar gobierno renunciar resignarse titular,äúno tarea segundo importante general,äúno tarea segundo importante general promover,äúno tarea segundo importante general promover proteger,äúyo desear segundo formar rico,äúyo desear segundo formar rico segundo,äúyo desear segundo formar rico segundo grande
0,6,6,6,10,10,10,14,14,14,5,...,6,6,7,7,8,8,8,6,6,6


### d.) Keyword analysis among Tweets that mention Venezuela or Venezolano/a

In [18]:
# Isolate Tweets that mention Venezuela or Venezolano/a
# Match "venez" regardless of case so that "Venezuela", "VENEZUELA" and
# "venezolano/a" are all found; na=False treats a missing tweet text as "no match".
spanish_tweets_venezuela = spanish_tweets[
    spanish_tweets.tweet_text.str.contains("venez", case=False, na=False)
]

# 3. Hashtag Analysis

In [19]:
# Remove parens and filter hashtags
# Strip the surrounding brackets from each hashtag list string. Missing values
# (floats, i.e. NaN) and strings shorter than 3 characters (such as '[]')
# become the placeholder 'None'.
hashtags = [
    'None' if isinstance(raw, float) or len(raw) < 3 else raw[1:-1]
    for raw in spanish_tweets.hashtags
]

# Add filtered hashtags back to data frame
spanish_tweets.hashtags = hashtags

# Count every hashtag token over all Spanish tweets
hashtag_vectorizer = CountVectorizer(encoding='utf-8-sig', analyzer='word', ngram_range=(1,1))
hashtag_count_sm = hashtag_vectorizer.fit_transform(spanish_tweets['hashtags'])
# From here on `hashtags` holds the vectorizer's vocabulary, not the per-tweet strings.
# get_feature_names() is absent from newer scikit-learn releases; use
# get_feature_names_out() when it exists and fall back to the old name otherwise.
hashtags = list(
    hashtag_vectorizer.get_feature_names_out()
    if hasattr(hashtag_vectorizer, 'get_feature_names_out')
    else hashtag_vectorizer.get_feature_names()
)
hashtag_total = hashtag_count_sm.sum(axis = 0)
hashtag_count_df = pd.DataFrame(hashtag_total, columns= hashtags)

In [20]:
# Display the single-row frame of total counts per hashtag token
hashtag_count_df

Unnamed: 0,01jul,02ago,02feb,03sep,04ago,04nov,06agos,06agosto,09may,09sep,...,ÿäÿü,ÿäÿü_ÿßÿñÿ,ÿæÿ,ÿé_ÿπÿüÿø_ÿßÿµÿøÿçÿßÿäÿé,ÿñÿçÿ,ÿø,ÿøÿäÿ,ÿü,ÿü_ÿ,ÿüÿàÿßÿ
0,2,4,9,8,2,2,2,2,1,5,...,2,2,4,2,2,6,2,1,3,1


# 4. Link Farming Analysis

What proportion of Tweets follow the typical link farming pattern?

In [3]:
# Reload the Spanish tweets from the CSV file, replacing the in-memory frame
# (this discards the hashtag edits made to it in the previous section)
spanish_tweets = pd.read_csv('proj_data/spanish_tweets.csv', encoding='utf-8-sig', index_col = 0).reset_index()

In [24]:
# Replace tweets without link with Null value
spanish_tweets.urls = spanish_tweets.urls.replace({'[]': np.nan})


tweets_with_link = 0
tweets_wout_link = 0

for url in spanish_tweets.urls:
    if pd.isnull(url) == True:
        tweets_with_link +=1
    else:
        tweets_wout_link += 1
        
print("There are " + str(tweets_with_link) + " Tweets with links")
print("There are " + str(tweets_wout_link) + " Tweets without links")

There are 27677 Tweets with links
There are 19215 Tweets without links


In [34]:
# Tweets that carry at least one link (non-null urls)
spanish_tweets_url = spanish_tweets[spanish_tweets.urls.notnull()]
# ...of which those whose text contains an '@'
spanish_tweets_url_at = spanish_tweets_url[spanish_tweets_url.tweet_text.str.contains('@')]
print(f"{len(spanish_tweets_url_at)} of the Tweets with links mention other accounts")

13858 of the Tweets with links mention other accounts


In [35]:
# Narrow the link + '@' tweets to those whose text starts with 'RT'
starts_with_rt = spanish_tweets_url_at.tweet_text.str.startswith('RT')
spanish_tweets_url_at_rt = spanish_tweets_url_at[starts_with_rt]
print(f"{len(spanish_tweets_url_at_rt)} of the Tweets with links are retweets that mention other accounts")

9716 of the Tweets with links are retweets that mention other accounts


# - Duplicates

In [4]:
# Series of distinct tweet_text values (repeated texts collapsed to one entry)
spanish_tweets_no_content_copy = spanish_tweets.tweet_text.drop_duplicates()

In [6]:
# Total number of rows in the Spanish subset
print(f"There are {len(spanish_tweets)} total tweets")

There are 46892 total tweets


In [11]:
# Duplicates = total rows minus distinct tweet texts
print(
    f"There are {len(spanish_tweets_no_content_copy)} unique tweets; therefore, "
    f"{len(spanish_tweets) - len(spanish_tweets_no_content_copy)} are duplicates."
)

There are 32014 unique tweets; therefore, 14878 are duplicates.


# 5. Date analysis

In [12]:
# Extract the date part of each timestamp: the text before the first
# whitespace. Splitting does not depend on the width of the time part, unlike
# cutting a fixed number of trailing characters. Missing timestamps give ''.
tweet_date = []
for date in spanish_tweets.tweet_time:
    parts = [] if pd.isnull(date) else str(date).split()
    tweet_date.append(parts[0] if parts else '')

spanish_tweets['tweet_date'] = tweet_date

In [22]:
# Count of tweet_text values for each tweet_date. tweet_date is a string, so
# the groups are ordered as text (e.g. '1/10/16' before '9/7/17'), not by date.
tweets_per_day = spanish_tweets.groupby('tweet_date')[['tweet_text']].count()
# Save the per-day counts to proj_data
tweets_per_day.to_csv('proj_data/tweets_per_day.csv', encoding='utf-8-sig')

In [20]:
# Display the per-day tweet counts
tweets_per_day

Unnamed: 0_level_0,tweet_text
tweet_date,Unnamed: 1_level_1
,2
1/1/16,53
1/1/17,39
1/1/18,68
1/10/16,6
...,...
9/7/17,99
9/8/16,8
9/8/17,65
9/9/16,18
