# Disinformation on Twitter
Iranian state-sponsored campaigns aimed at Venezuela


## Step 1: Data Cleaning
#### This chunk of code imports the libraries used in this step:

In [2]:
import pandas as pd
import os

#### This lets the program know which is the current folder and which is the folder with the data in it

In [2]:
# Project root is the current working directory of the kernel
PROJ_ROOT_DIR = os.getcwd()

# Folder that holds the raw CSV files; create it if it is not there yet
DATA_PATH = os.path.join(PROJ_ROOT_DIR, "csv")
os.makedirs(DATA_PATH, exist_ok=True)

#### a.) Combine all the data files 

In [None]:
def load_twitter_data(data_path=None):
    """
    Load every CSV file in the data folder into a single data frame.

    Argument: data_path - folder to read the .csv files from; when omitted,
              the module-level DATA_PATH is used
    Output: one data frame holding the rows of all files (union of their
            columns, sorted by name), with exact duplicate rows removed and
            a fresh 0..n-1 index
    Raises: ValueError if the folder contains no .csv files
    """
    if data_path is None:
        data_path = DATA_PATH

    # os.listdir() returns names in arbitrary order; sorting makes the row
    # order of the combined frame the same on every run
    files = sorted(f for f in os.listdir(data_path) if f.endswith(".csv"))

    # Fail with an explicit message instead of pandas' generic
    # "No objects to concatenate"
    if not files:
        raise ValueError("No .csv files found in " + str(data_path))

    # One data frame per file
    frames = [
        pd.read_csv(os.path.join(data_path, filename), low_memory=False)
        for filename in files
    ]

    # Stack all frames, drop rows that are exact duplicates, renumber the index
    df_full = (
        pd.concat(frames, join='outer', sort=True)
        .drop_duplicates()
        .reset_index(drop=True)
    )

    return df_full

# Combined raw data frame built from every .csv file under DATA_PATH
tweets = load_twitter_data()

#### b.) Select only the columns we are interested in

In [5]:
# Columns kept for the analysis
COLUMNS_OF_INTEREST = [
    'user_screen_name',
    'user_display_name',
    'user_reported_location',
    'account_language',
    'tweet_language',
    'tweet_text',
    'tweet_time',
    'urls',
    'hashtags',
    'is_retweet',
]
tweets_clean = tweets[COLUMNS_OF_INTEREST]

# Top 10 rows of data
tweets_clean.head(10)

Unnamed: 0,user_screen_name,user_display_name,user_reported_location,account_language,tweet_language,tweet_text,tweet_time,urls,hashtags,is_retweet
0,akhonfellah,⁦🇮🇷⁩أخٌ‌في‌الله,Iran,en,en,one person followed me // automatically checke...,2017-01-11 05:23,['http://fllwrs.com'],[],False
1,akhonfellah,⁦🇮🇷⁩أخٌ‌في‌الله,Iran,en,fa,#IDFTerrorists\nحماسه تروریستهای اسرائیلی http...,2018-05-26 00:48,[],['IDFTerrorists'],False
2,akhonfellah,⁦🇮🇷⁩أخٌ‌في‌الله,Iran,en,en,Stop war on Yemen hospitals\n#ShameOnUN\n#Yemen,2018-06-16 20:06,[],"['ShameOnUN', 'Yemen']",False
3,akhonfellah,⁦🇮🇷⁩أخٌ‌في‌الله,Iran,en,fa,لبیک یا فقیه\n#مجزرة_الدراز https://t.co/nKfQW...,2018-05-23 18:22,[],['مجزرة_الدراز'],False
4,akhonfellah,⁦🇮🇷⁩أخٌ‌في‌الله,Iran,en,fa,اینجا تل ابیب است\nاینها اسراییلیهایی هستند که...,2019-01-28 16:56,[],['زندگی_سگی_اسرائیلیها'],False
5,akhonfellah,⁦🇮🇷⁩أخٌ‌في‌الله,Iran,en,ar,وامروز هم ....\n#زندگی_سگی_اسرائیلیها https://...,2018-09-07 10:42,[],['زندگی_سگی_اسرائیلیها'],False
6,akhonfellah,⁦🇮🇷⁩أخٌ‌في‌الله,Iran,en,fa,حکمت ثابت موندن اسم ماههای قمری بعد از تغییر ز...,2017-11-19 19:40,[],[],False
7,akhonfellah,⁦🇮🇷⁩أخٌ‌في‌الله,Iran,en,fa,جمله ای که سیدحسن امشب گفت در مورد انقلاب اسلا...,2019-02-06 18:26,[],['إن_مع_الصبر_نصرا'],False
8,akhonfellah,⁦🇮🇷⁩أخٌ‌في‌الله,Iran,en,fa,بشنوید مدح حاج محمودآقوی کریمی رو با لهجه شیرا...,2017-08-04 12:25,['https://twitter.com/khanisadiq/status/893438...,[],False
9,akhonfellah,⁦🇮🇷⁩أخٌ‌في‌الله,Iran,en,ar,RT @awadazeinab1: كم ساعة مع ابني بالمستشفى شف...,2019-01-25 17:48,[],[],True


#### c.) Filter Tweets, keeping those that meet at least one of these conditions:
 - account reported location is Venezuela, OR
 - account language is Spanish, OR
 - tweet language is Spanish

In [6]:
# A tweet is kept when ANY of the three conditions holds
located_in_venezuela = tweets_clean.user_reported_location == 'Venezuela'
spanish_account = tweets_clean.account_language == 'es'
spanish_tweet = tweets_clean.tweet_language == 'es'

# Renumber the rows 0..n-1 after filtering
tweets_clean = tweets_clean[located_in_venezuela | spanish_account | spanish_tweet].reset_index(drop=True)

#### d.) Remove Tweets whose reported account location is in Europe, the US, or Canada

In [7]:
# Reported locations to exclude (exact string matches only)
EXCLUDED_LOCATIONS = [
    'London',
    'Manhattan, NY',
    'Brooklyn, NY',
    'Queens, NY',
    'New York, NY',
    'California, USA',
    'New Jersey, USA',
    'North Holland, The Netherlands',
    'Atlantic City, NJ',
    'Mountain View, CA',
    'New York, USA',
    'Canada',
    'San Francisco, CA',
    'Washington, USA',
    'Washington, DC',
    'España',
    'Germany',
    'Nantes, France',
    'Houston, TX',
    'Texas,San Antonio',
    'Chicago',
    'Atlanta',
    'Washington,Seattle',
    'Fremont, CA',
    'France',
    'England, United Kingdom',
    'Oregon,Portland',
    'USA',
    'Florida,Orlando',
    'Califor',
    'California,Los Angeles',
    'Illinois, USA',
    'Arizona,phoenix',
    'Pennsylvania,Pittsburgh',
    'Pennsylvania,Philadelphia',
    'Dallas, TX',
]

# Keep every row whose location is not in the list; rows with a missing
# location are kept as well, exactly as with a chain of `!=` comparisons.
# .copy() makes the result an independent frame so that adding columns to it
# later does not trigger pandas' SettingWithCopyWarning.
tweets_clean = tweets_clean[~tweets_clean.user_reported_location.isin(EXCLUDED_LOCATIONS)].copy()

# Top 10 rows of data
tweets_clean.head(10)

Unnamed: 0,user_screen_name,user_display_name,user_reported_location,account_language,tweet_language,tweet_text,tweet_time,urls,hashtags,is_retweet
0,akhonfellah,⁦🇮🇷⁩أخٌ‌في‌الله,Iran,en,es,No parallel in history....\n#Hussain https://t...,2018-09-19 20:04,[],['Hussain'],False
1,akhonfellah,⁦🇮🇷⁩أخٌ‌في‌الله,Iran,en,es,HABIL CAFE😂 https://t.co/5ipZYvYA8X,2017-10-04 03:28,['https://twitter.com/KnowKaduna/status/915295...,[],False
2,Romeo1997er,Romeo,,fa,es,RT @countdown2040: Gazans prepare for Hajj\nht...,2017-08-29 03:17,['http://www.countdown2040.com/ShowGallery/69/'],[],True
3,Kasia36790875,Kasia,,fa,es,RT @countdown2040: Abbas sends medical aid to ...,2017-11-18 16:36,['http://www.countdown2040.com/ShowNews/1014/'],[],True
4,Hmn90432381,H.m.n,,fa,es,RT @countdown2040: Gazans prepare for Hajj\nht...,2017-09-24 14:28,['http://www.countdown2040.com/ShowGallery/69/'],[],True
5,koInQlW0rKxQPoTuf5BmVjKyTvYJR5JdKeo8spDdrwM=,koInQlW0rKxQPoTuf5BmVjKyTvYJR5JdKeo8spDdrwM=,,fa,es,RT @countdown2040: Abbas sends medical aid to ...,2017-10-30 16:52,['http://www.countdown2040.com/ShowNews/1014/'],[],True
6,24evm+SfMta5ONKMRjQe1Qj39PdyLGPqMMXl8XYDbg=,24evm+SfMta5ONKMRjQe1Qj39PdyLGPqMMXl8XYDbg=,,en,es,RT @countdown2040: Video: Lana Del Rey Ignores...,2018-08-28 08:33,['http://www.countdown2040.com/ShowMovieList/9...,"['GroupPalestine', 'قروب_فلسطيني']",True
7,GP3PzukVFPWVoLtVWTEyy20m2lRZaMaRmN7n0lz7Bg=,GP3PzukVFPWVoLtVWTEyy20m2lRZaMaRmN7n0lz7Bg=,Earth,fa,es,RT @countdown2040: Gazans prepare for Hajj\nht...,2017-09-28 17:53,['http://www.countdown2040.com/ShowGallery/69/'],[],True
8,Richard80907,Richard,,fa,es,RT @countdown2040: Gazans prepare for Hajj\nht...,2017-11-05 15:47,['http://www.countdown2040.com/ShowGallery/69/'],[],True
9,UW2EZRTAv0C7rCy2LOI2SBiYh8IrdwmQAdI7p7yqok=,UW2EZRTAv0C7rCy2LOI2SBiYh8IrdwmQAdI7p7yqok=,,en,es,"Abbas sends medical aid to Venezuela, sparking...",2017-08-23 08:02,['http://www.UW2EZRTAv0C7rCy2LOI2SBiYh8IrdwmQA...,[],False


#### e.) Quality control for Twitter language identification 
- Import SpaCy, a text-processing package, and load Spanish and English models and Language Identifier

In [14]:
import spacy
from spacy_langdetect import LanguageDetector

# Initialize spacy with the SPANISH model
sp = spacy.load('es_core_news_sm')
# Attach the language detector as the last step of the Spanish pipeline so
# that documents parsed with `sp` expose `doc._.language`
# NOTE(review): passing a component instance to add_pipe looks like the
# spaCy 2.x calling convention - confirm the pinned spaCy version
sp.add_pipe(LanguageDetector(), name = 'language_detector', last = True)
# ENGLISH model (no language detector is attached to this one)
eng = spacy.load('en_core_web_sm')

- Identify language of each Tweet using SpaCy language identification 

In [5]:
def detect_language(df, content_column):
    '''
    A function to detect the language of each text and store it in a new column

    Argument: a dataframe and the name of the column holding the text
    Ouput: the same dataframe (modified in place) with a new
           'spacy_language_detection' column
    '''

    # Parse every text with the `sp` pipeline and keep only the detected language code
    detected = [sp(text)._.language['language'] for text in df[content_column]]

    # Store the codes alongside the original rows
    df['spacy_language_detection'] = detected

    return df

In [None]:
# Adds the 'spacy_language_detection' column to tweets_clean; each tweet is
# parsed individually, so this cell may take a while on a large data set
tweets_clean = detect_language(df = tweets_clean, content_column = 'tweet_text')

- Keep Tweets that are marked as either Spanish by BOTH Twitter and SpaCy or English by BOTH Twitter and SpaCy.
- Separate into 'Spanish' and 'English' datasets.

In [None]:
# A language label is trusted only when Twitter's tag and SpaCy's detection agree
labels_agree = tweets_clean.tweet_language == tweets_clean.spacy_language_detection

spanish_tweets = tweets_clean[labels_agree & (tweets_clean.tweet_language == 'es')]
english_tweets = tweets_clean[labels_agree & (tweets_clean.tweet_language == 'en')]

## Step 2: Keyword analysis

#### a.) Use SpaCy to “clean” each Tweet by removing stop words (very common words that carry little meaning) and non-alphabetic tokens, and by reducing each word to its lemma or root.
This will allow us to create a consistent vocabulary to analyze keywords.

In [7]:
def clean_string(text_string, language):
    '''
    A function to clean a string using SpaCy: drops stop words and tokens that
    are not purely alphabetic, and replaces each remaining token with its lemma

    Argument: a text string and a language ('English' or 'Spanish')
    Output: a cleaned string (the kept lemmas joined by single spaces)
    Raises: ValueError if language is neither 'English' nor 'Spanish'

    '''
    # Parse the text string using the model that matches the requested language
    if language == 'Spanish':
        doc = sp(text_string)
    elif language == 'English':
        doc = eng(text_string)
    else:
        # Without this guard `doc` would never be assigned and the code below
        # would fail with a confusing unbound-variable error
        raise ValueError("language must be 'Spanish' or 'English', got " + repr(language))

    # Keep the lemma of every token that is alphabetic and is not a stop word.
    # NOTE(review): pronouns are not filtered out here - confirm whether they should be
    lemmas = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]

    # Join the list into a string
    return " ".join(lemmas)

def clean_content(df, content_column, language):
    '''
    A function to clean every string in one column of a corpus

    Argument: a dataframe, the name of the column with the content, and a language ('Spanish' or 'English')
    Ouput: the same dataframe (modified in place) with a new 'lemmatized_tweet_text' column
    '''

    # Run clean_string() over every entry of the content column, in row order
    lemmatized = [clean_string(text, language) for text in df[content_column]]

    # Attach the cleaned strings to the data frame
    df['lemmatized_tweet_text'] = lemmatized

    return df

***Here is an example of how this works***

In [25]:
# Spanish sample: the tweet stored under index label 0
example_sp = spanish_tweets.loc[0, 'tweet_text']
example_sp_clean = clean_string(example_sp, 'Spanish')
print("Raw Spanish example: \n" + example_sp)
print("\nClean exmaple: \n" + example_sp_clean)


# English sample: the tweet stored under index label 4
example_eng = english_tweets.loc[4, 'tweet_text']
example_eng_clean = clean_string(example_eng, "English")
print("\n\n\nRaw English example: \n" + example_eng)
print("\nClean exmaple: \n" + example_eng_clean)

Raw Spanish example: 
RT @MFloresBazaldua: lo que no ves con tus ojos, no lo inventes con tu boca http://t.co/oOzAorw7Fc

Clean exmaple: 
RT ver ojo inventar boca



Raw English example: 
Great tool. Very easy to use and it does the job very well! I highly recommend it! #BulkFollower https://t.co/SzGtLXW16B

Clean exmaple: 
great tool very easy use job -PRON- highly recommend bulkfollower


In [26]:
# Reload the saved Spanish and English tweet tables. The first CSV column is
# read as the index (presumably the row index written when the file was saved)
# and then replaced by a fresh 0..n-1 index.
spanish_tweets, english_tweets = (
    pd.read_csv(csv_path, encoding='utf-8-sig', index_col=0).reset_index(drop=True)
    for csv_path in ('xlsx/spanish_tweets.csv', 'xlsx/english_tweets.csv')
)

#### b.) Use Scikit-learn ‘CountVectorizer’ package to create word counts for Spanish and English tweets
First we create a matrix where each row is a Tweet, each column is a word found in at least 5 Tweets, and each entry is the number of times that word occurs in that Tweet.

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
# Keep only words that appear in at least 5 tweets (min_df=5);
# ngram_range=(1,1) means single words only
word_vectorizer = CountVectorizer(encoding='utf-8-sig', analyzer='word', min_df=5, ngram_range=(1,1))
# Sparse count matrix: one row per tweet, one column per retained word
word_count_sm = word_vectorizer.fit_transform(spanish_tweets['lemmatized_tweet_text'])

Then we sum the count of each word over all Tweets

In [None]:
# get_feature_names() is no longer available in recent scikit-learn releases,
# which provide get_feature_names_out() instead; use whichever is present
if hasattr(word_vectorizer, "get_feature_names_out"):
    words = list(word_vectorizer.get_feature_names_out())
else:
    words = word_vectorizer.get_feature_names()

# Sum down the rows: one total per word over all tweets (a single-row matrix)
word_count_total = word_count_sm.sum(axis=0)
word_count_total_df = pd.DataFrame(word_count_total, columns = words)

#### c.) Use Scikit-learn ‘CountVectorizer’ package to create phrase counts for Spanish and English tweets
We set the length of phrases to be between 3 and 5 words long for one data frame and between 5 and 7 for the other.

In [None]:
# Phrases of 5 to 7 consecutive words that appear in at least 5 tweets
phrase_vectorizer = CountVectorizer(encoding='utf-8-sig', analyzer='word', min_df=5, ngram_range=(5,7))
# create matrix where each row is a tweet, each column is a phrase and each
# entry is the count of that phrase in that tweet
phrase_count_sm = phrase_vectorizer.fit_transform(spanish_tweets['lemmatized_tweet_text'])
# get_feature_names() is no longer available in recent scikit-learn releases,
# which provide get_feature_names_out() instead; use whichever is present
if hasattr(phrase_vectorizer, "get_feature_names_out"):
    phrases = list(phrase_vectorizer.get_feature_names_out())
else:
    phrases = phrase_vectorizer.get_feature_names()
# sum the count of each phrase over all Tweets
phrase_count_total = phrase_count_sm.sum(axis=0)
phrase_count_total_df = pd.DataFrame(phrase_count_total, columns = phrases)