Import packages

In [17]:
import gc

import matplotlib.pyplot as plt 
import numpy as np
import csv

Definition of time constants 


In [18]:
# First and last day of the studied time window, stored as numpy dates.
START_DATE = np.datetime64("2014-07-01")
END_DATE = np.datetime64("2018-06-01")


Filter the data:
- Keep only tweets written in English
- Remove links from the tweets
- Remove columns that we do not need
- Reformat dates

In [7]:
from filter_data import * 

# Produce a filtered copy of each of the 13 raw CSV files (indices 1 to 13).
# NOTE(review): filter_csv, extract_date_columns, remove_links and column_indices
# are not defined in this notebook; presumably they come from the star import of
# filter_data -- confirm there.
for i in range(1, 14): 
    input_path= "./data/original_data/IRAhandle_tweets_{}.csv".format(i)
    output_path= "./data/filtered_data/IRAhandle_tweets_{}.csv".format(i)
    # Presumably keeps the columns listed in column_indices and only the rows whose
    # column 4 equals "English" -- TODO confirm against filter_data.
    filter_csv(input_path, output_path, column_indices, 4, "English")
    # The next two steps receive the filtered file as both of their arguments.
    extract_date_columns(output_path, output_path)
    remove_links(output_path, output_path)

Define the date array 

In [19]:
def get_date_array(start=None, end=None):
    """Return every day from ``start`` to ``end`` (both included).

    Parameters
    ----------
    start : date-like, optional
        First day of the range; anything accepted by ``np.datetime64``.
        Defaults to the notebook constant ``START_DATE``.
    end : date-like, optional
        Last day of the range (inclusive). Defaults to ``END_DATE``.

    Returns
    -------
    numpy.ndarray
        One-dimensional ``datetime64[D]`` array with one entry per day.
        It is empty when ``end`` lies before ``start``.
    """
    # Fall back on the notebook-wide window so that get_date_array() keeps
    # returning exactly what it did before the parameters were added.
    first_day = np.datetime64(START_DATE if start is None else start, 'D')
    last_day = np.datetime64(END_DATE if end is None else end, 'D')

    # np.arange excludes its stop value, hence the extra day to keep the
    # end date inside the range.
    return np.arange(first_day, last_day + np.timedelta64(1, 'D'), dtype='datetime64[D]')

We can now plot the number of tweets across time in our dataset

In [9]:
import datetime



# Number of tweets per day
def get_number_of_tweet_each_day(input_file, date_array):
    """Count the rows of one CSV file for each day of ``date_array``.

    The first line of the file is treated as a header and skipped. For every
    other row, columns 3, 4 and 5 are read as year, month and day.

    Parameters
    ----------
    input_file : str
        Path of the CSV file to read (opened as UTF-8).
    date_array : numpy.ndarray
        Array of ``datetime64`` days that defines the output positions.

    Returns
    -------
    numpy.ndarray
        Float array with the shape of ``date_array``; each entry is the number
        of rows whose date equals the matching entry of ``date_array``. Rows
        dated outside ``date_array`` are ignored.

    Raises
    ------
    ValueError
        If a row holds a non-numeric or impossible year/month/day.
    IndexError
        If a non-empty row has fewer than six columns.
    """
    number_of_tweets_per_day = np.zeros(date_array.shape)

    # Tally the rows per distinct date first: date_array is then scanned once
    # per distinct date instead of once per CSV row.
    rows_per_date = {}

    with open(input_file, 'r', newline='', encoding='utf-8') as infile:
        reader = csv.reader(infile)

        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(reader, None)

        for row in reader:
            # csv.reader yields an empty list for a blank line.
            if not row:
                continue
            date = datetime.date(int(row[3]), int(row[4]), int(row[5]))
            rows_per_date[date] = rows_per_date.get(date, 0) + 1

    for date, row_count in rows_per_date.items():
        # A date that is absent from date_array gives an all-False mask, so
        # its rows are dropped.
        number_of_tweets_per_day[date_array == np.datetime64(date)] += row_count

    return number_of_tweets_per_day
           
# print(date_array)

date_array = get_date_array()

print(date_array.shape)
print(np.where(date_array == "2018-01-01"))
number_of_tweets_per_day= np.zeros(date_array.shape)

for i in range(1, 14):
    number_of_tweets_per_day += get_number_of_tweet_each_day("./data/filtered_data/IRAhandle_tweets_" + str(i) + ".csv", date_array)

print(np.sum(number_of_tweets_per_day, axis=0))
%matplotlib qt

plt.plot(date_array, number_of_tweets_per_day)
plt.show()


(1432,)
(array([], dtype=int64),)
2116719.0


MESA-LOADER: failed to open iris: /usr/lib/dri/iris_dri.so: cannot open shared object file: No such file or directory (search paths /usr/lib/x86_64-linux-gnu/dri:\$${ORIGIN}/dri:/usr/lib/dri, suffix _dri)
failed to load driver: iris
MESA-LOADER: failed to open swrast: /usr/lib/dri/swrast_dri.so: cannot open shared object file: No such file or directory (search paths /usr/lib/x86_64-linux-gnu/dri:\$${ORIGIN}/dri:/usr/lib/dri, suffix _dri)


Free memory

In [12]:
import gc

# Drop the notebook-level names of the two arrays, then ask the garbage
# collector to run; the value of the last expression is shown as cell output.
del number_of_tweets_per_day, date_array
gc.collect()

3269

Helper function: check whether a text contains at least one word from a given set

In [10]:
def contains_any_word(text, word_set):
    """Tell whether ``text`` shares at least one token with ``word_set``.

    ``text`` is cut on whitespace only and tokens are compared as-is, so the
    match is case-sensitive and punctuation stays attached to its token.
    """
    shared_tokens = set(text.split()).intersection(word_set)
    return len(shared_tokens) > 0

Single file dictionary based statistics

In [28]:
def get_number_of_tweets_per_day_with_at_least_one_word(input_file, dictionary, date_array):
    """Count, for each day, the rows whose text contains a word of ``dictionary``.

    The first line of the file is treated as a header and skipped. For every
    other row, column 1 is the text handed to ``contains_any_word`` and
    columns 3, 4 and 5 are read as year, month and day.

    Parameters
    ----------
    input_file : str
        Path of the CSV file to read (opened as UTF-8).
    dictionary : collection of str
        Words passed to ``contains_any_word`` as its word set.
    date_array : numpy.ndarray
        Array of ``datetime64`` days that defines the output positions.

    Returns
    -------
    numpy.ndarray
        Float array with the shape of ``date_array``; each entry is the number
        of matching rows whose date equals the matching entry of
        ``date_array``. Matching rows dated outside ``date_array`` are ignored.

    Raises
    ------
    ValueError
        If a matching row holds a non-numeric or impossible year/month/day.
    IndexError
        If a non-empty row is too short for the columns that are read.
    """
    nb_of_tweets_per_day_positive = np.zeros(date_array.shape)

    # Tally the matching rows per distinct date first: date_array is then
    # scanned once per distinct date instead of once per matching row.
    matches_per_date = {}

    with open(input_file, 'r', newline='', encoding='utf-8') as infile:
        reader = csv.reader(infile)

        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(reader, None)

        for row in reader:
            # csv.reader yields an empty list for a blank line.
            if not row:
                continue
            if not contains_any_word(row[1], dictionary):
                continue
            date = datetime.date(int(row[3]), int(row[4]), int(row[5]))
            matches_per_date[date] = matches_per_date.get(date, 0) + 1

    for date, match_count in matches_per_date.items():
        # A date that is absent from date_array gives an all-False mask, so
        # its rows are dropped.
        nb_of_tweets_per_day_positive[date_array == np.datetime64(date)] += match_count

    return nb_of_tweets_per_day_positive
    

Test function 

In [31]:
# Smoke test on the first filtered file: per-day count of the tweets that
# contain the token "Hi".
dictionary = {"Hi"}
date_array = get_date_array()
hi_counts = get_number_of_tweets_per_day_with_at_least_one_word(
    "./data/filtered_data/IRAhandle_tweets_1.csv", dictionary, date_array
)
print(hi_counts.sum())
plt.plot(date_array, hi_counts)
plt.show()

# Release the arrays before moving on
del date_array, hi_counts
gc.collect()

45.0


23469

Analysis on all files 


In [51]:
def all_files_analysis(dictionary, date_array):
    """Sum the per-day keyword counts of the filtered files numbered 1 to 13.

    Returns a float array with the shape of ``date_array``.
    """
    per_file_counts = (
        get_number_of_tweets_per_day_with_at_least_one_word(
            "./data/filtered_data/IRAhandle_tweets_{}.csv".format(file_number),
            dictionary,
            date_array,
        )
        for file_number in range(1, 14)
    )
    # Start from zeros so the result keeps the shape of date_array.
    return sum(per_file_counts, np.zeros(date_array.shape))


Test

In [52]:
# Same check as on the single file, this time over all the filtered files.
dictionary = {"Hi"}
date_array = get_date_array()
hi_counts_all_files = all_files_analysis(dictionary, date_array)

plt.plot(date_array, hi_counts_all_files)
plt.show()
print(hi_counts_all_files.sum(axis=0))

# Release the arrays before moving on
del hi_counts_all_files, date_array
gc.collect()

448.0


2990

Parallel version

In [54]:
import concurrent.futures
def analyze_file(file_index, dictionary, date_array):
    """Run the per-day keyword count on the filtered file numbered ``file_index``."""
    return get_number_of_tweets_per_day_with_at_least_one_word(
        f"./data/filtered_data/IRAhandle_tweets_{file_index}.csv",
        dictionary,
        date_array,
    )

def all_files_analysis_par(dictionary, date_array):
    """Thread-pool variant of the all-files count.

    One ``analyze_file`` task is submitted per file number from 1 to 13 and
    the per-day arrays are added up as the tasks finish.
    """
    total_counts = np.zeros(date_array.shape)

    with concurrent.futures.ThreadPoolExecutor() as pool:
        pending = []
        for file_number in range(1, 14):
            pending.append(pool.submit(analyze_file, file_number, dictionary, date_array))

        # Accumulate in completion order; result() re-raises a task's exception.
        for finished in concurrent.futures.as_completed(pending):
            total_counts += finished.result()

    return total_counts



Test


In [55]:
# Check of the thread-pool variant with the same word set as the sequential run.
dictionary = {"Hi"}
date_array = get_date_array()
hi_counts_parallel = all_files_analysis_par(dictionary, date_array)
plt.plot(date_array, hi_counts_parallel)
plt.show()
print(hi_counts_parallel.sum(axis=0))

# Release the arrays before moving on
del hi_counts_parallel, date_array
gc.collect()

448.0


2987