In [87]:
# Standard library imports
import os
import re
import csv
import sys
import html
from datetime import datetime
from collections import defaultdict

# Third-party library imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import dask.dataframe as dd
import psutil
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Local application/library specific imports
import utils
from utils import *



from unidecode import unidecode
import importlib

In [88]:
# # How much memory available?
# def get_available_memory():
#     return psutil.virtual_memory().available

# available_memory = get_available_memory()
# print(f"Available memory: {available_memory / (1024 * 1024 * 1024)} GB")


# # check size of loaded df and print 
# def get_df_memory_usage(df):
#     return df.info(memory_usage='deep')

# get_df_memory_usage(df)

## 1. Filtering french brands

Here, we'll aim to select only the french brands. 
- One approach is to use language detection. Via the gcd3 library (?)
- Another approach could be to look at location


Plan:
- Load the df of the brands and their bios. 
- Filter
- Now, filter the reduced follower df based on these final french brands

- are all users now french ? (look at combination of language and location - potentially drop those that does not have any usable indicator)


- Might need to manually inspect these later on

In [89]:
#reload(utils)
importlib.reload(utils)

#How is my data delimited?
path = '/home/livtollanes/SocialMarkers'
file = 'markers_followers_bios_2023-05-19.csv'

utils.print_lines(path, file, 0,5)

#The data is comma delimited

twitter_id,id,screen_name,name,description,url,timestamp_utc,local_time,location,verified,protected,tweets,followers,friends,likes,lists,image,default_profile,default_profile_image,witheld_in_countries,witheld_scope

3342215494,3342215494,titisanogo8,Titi sanogo,Je crois en DIEU et à mon travail j'y arriverai.....,,1435017944,2015-06-23T00:05:44,"Ile-de-France, France",0,0,6,44,733,91,0,https://pbs.twimg.com/profile_images/1249394390029742081/xuVolLn6_normal.jpg,1,0,,

3115495713,3115495713,AndreDeybach,DEYBACH André,,,1427309108,2015-03-25T18:45:08,,0,0,0,1,40,0,0,https://pbs.twimg.com/profile_images/580803690757533697/pHNcCBLh_normal.jpg,1,0,,

244075010,244075010,matttownley1985,Matt Townley,"Hotelier, traveller, fan of all things hospitality, great food and fine wine! All views my own etc!!",,1296220595,2011-01-28T13:16:35,"Manchester, England",0,1,2535,772,1264,1251,7,https://pbs.twimg.com/profile_images/928075998930681856/ZFXboKc3_normal.jpg,0,0,,

2986463442,2986463442,alex_guev

In [90]:
#load the markers_bios_2023-05-19.csv file
path = '/home/livtollanes/SocialMarkers'
file = 'markers_bios_2023-05-19.csv'

req_cols = ['twitter_name', 'id', 'screen_name', 'description', 'location', 'tweets', 'followers', 'friends', 'likes', 'lists','timestamp_utc']

dtypes = {'twitter_name':'object', 
          'id': 'float64',
          'screen_name': 'object', 
          'description': 'object',
          'location': 'object',
          'tweets': 'float64',
          'followers': 'float64',
          'friends': 'float64',
          'likes': 'float64',
          'lists': 'float64',
          'timestamp_utc': 'float64'}

dfm = utils.fileloader(path, file, req_cols, dtypes)

In [91]:
# Remove emojis, weird font, and detect language in descriptions

# Process the descriptions in the DataFrame
dfm = utils.process_description(dfm)

# Ssplit the DataFrame by language
dfm_french, dfm_other = utils.split_by_language(dfm, 'fr')

# Finally, print information about the resulting DataFrames
utils.print_df_info(dfm_french, 'dfm_french')
utils.print_df_info(dfm_other, 'dfm_other')

KeyboardInterrupt: 

Manual inspection - did we miss any brands that are french?
- This part was done by inspecting the data frame and looking up brands

The brands that were incorrectly detected as non french:

- 21: Lafuma_France
- 30: CarrefourFrance
- 36: CasinoEnseigne
- 37: Supermarche_G20
- 38: VogueFrance
- 39: FigaroMagazine
- 41: LeMediaTv
- 48: BFMTV
- 92: TeleLoisirs
- 94: ParisMatch
- 96: Telerama
- 101: EntMagazine
- 106: OnzeMondial
- 113: GQ_France
- 119: LEXPRESS
- 121: courrrierinter
- 124: RCLens
- 128: OL
- 129: ognice
- 131: StadeDeReims
- 133: MontpellierHSC
- 135: RCSA
- 171: HECParis
- 178: SciencesPo
- 181: Univbordeaux
- 182: UnivLyon1
- 199: UniversiteCergy
- 202: centralesupelec
- 208: ENSAEparis
- 215: esdes_BS
- 221: LaCoudouliere
- 226: LyceeProTissie
- 227: PSVLaTournelle
- 230: Decathlon
- 231: Darty_Officiel
- 233: Fnac
- 236: AmazonFrance

In [None]:
# Add extra brands that were incorrectly defined as non-french:
indices_to_change = [21, 30, 36, 37, 38, 39, 41, 48, 92, 94, 96, 101, 106, 113, 119, 121, 124, 128, 129, 131, 133, 135, 171, 178, 181, 182, 199, 202, 208, 215, 221, 226, 227, 230, 231, 233, 236]
path_tosave = '/home/livtollanes/NewData'
utils.add_extrabrands(indices_to_change, dfm_other, dfm_french, path_tosave)

So far, in this section, we have been working with the bios of the markers. We deleted all rows that were from brands that were not french. The final output of this section includes two dfs: one for the french brands and one for other brands (only selected columns)

In the next section, we will load the data with user bios and metadata. We will filter based on certain metrics, and also ignore all connections between followers and non french brands

## 2. Filtering out irrelevant users 

- In this part, we want to remove users that don't follow enough brands. 
- This is to match the procedure of He and Tsvetkova (2023)
- The purpose is to ensure we have enough information to generate SES estimates for users.

In [None]:
#Take a look at the data (Of the markers and their followers)
# path = '/home/livtollanes/SocialMarkers'
# file = 'markers_followers_2023-05-19.csv'


# def print_lines(path, file, start_line=0, end_line=10):
#     with open(f"{path}/{file}", 'r') as f:
#         for i in range(end_line):
#             line = f.readline()
#             if i >= start_line:
#                 print(line)

# print_lines(path, file)


In [None]:
#Import the french metadata
path = '/home/livtollanes/NewData'
file = 'french_brands.csv'

req_cols = ['twitter_name', 'id', 'screen_name', 'description', 'timestamp_utc', 'location', 'tweets', 'followers', 'friends', 'likes', 'lists', 'description_noems', 'language', 'corrected_language_country']

dtypes = {'twitter_name':'object', 
          'id': 'float64',
          'screen_name': 'object', 
          'description': 'object',
          'location': 'object',
          'tweets': 'float64',
          'followers': 'float64',
          'friends': 'float64',
          'likes': 'float64',
          'lists': 'float64',
          'timestamp_utc': 'float64',
          'description_noems': 'object',
          'language': 'object',
          'corrected_language_country': 'object'}

french_marker_bios = utils.fileloader(path, file, req_cols, dtypes)

In [None]:
#Import the list of marker ids and their followers' ids
path = '/home/livtollanes/SocialMarkers'
file = 'markers_followers_2023-05-19.csv'

req_cols = ['id', 'follower_id']
dtypes = {'id': 'float64',
          'follower_id': 'float64'}

markers_followers = utils.fileloader(path, file, req_cols, dtypes)

In [93]:
#We only care about french brands

# From the df with brand id and follower id, remove all rows with non french brands 
french_markers_followers = markers_followers[markers_followers['id'].isin(french_marker_bios['id'])]

# Save the filtered data
french_markers_followers.to_csv('/home/livtollanes/NewData/french_markers_followers_2023-05-19.csv', index=False)




AttributeError: 'str' object has no attribute 'iterrows'

In [94]:
# How many followers are there per brand?
importlib.reload(utils)
filepath = '/home/livtollanes/NewData/french_markers_followers_2023-05-19.csv'
followers_per_brand_count = utils.create_followers_per_brand_dict(filepath)

# Filter the dictionary to only include brands with more or equal to 10,000 followers
brands_10000_followers = {brand: followers for brand, followers in followers_per_brand_count.items() if followers > 9999}

# Print the number of such brands
print(len(brands_10000_followers))

# Save the filtered data
#french_markers_followers.to_csv('/home/livtollanes/NewData/french_markers_followers_2023-05-19.csv', index=False)

AttributeError: 'str' object has no attribute 'iterrows'

In [None]:
#count the unique ids in french_marker_bios and in filtered_markers_followers
print(french_marker_bios['id'].nunique())
print(markers_followers['id'].nunique())
print(french_markers_followers['id'].nunique())

#print the unique ids that occur in french_marker_bios but not in filtered_markers_followers
print(set(french_marker_bios['id']) - set(french_markers_followers['id']))

# Get the unique ids that occur in dfm_french but not in filtered_markers_followers
missing_ids = set(french_marker_bios['id']) - set(filtered_markers_followers['id'])

# Filter french_marker_bios to only include rows with these ids
missing_brands = french_marker_bios[french_marker_bios['id'].isin(missing_ids)]

# Print the missing brands
print(missing_brands)


179
236
176
{9.8328592828117e+17, 9.101276052774952e+17, 9.05803918931886e+17}


Obs: For three brands, existing in the french brands bio df, there are no data in the marker_follower data. Must inspect later

#### Identify the number of brands each unique follower follows

Create dictionary of [keys: follower_Id, Value: Brand_id]
Without loading the data into python

In [None]:
#Create a dictionary of [keys:follower_id, values: brands] based on the link between french brands and their followers
importlib.reload(utils)

filepath = '/home/livtollanes/NewData/french_markers_followers_2023-05-19.csv'
brands_per_follower, brands_per_follower_count = utils.create_brands_per_follower_dict(filepath)

#How many unique values (brands) are there in the dictionary?

utils.inspect_dict(brands_per_follower, 5)



The number of unique values in the dictionary is 176.
The number of keys in the dictionary is 16703791.
First 5 items in the dictionary:
('1.6598901997850952e+18', {'861574608.0', '25487201.0'})
('1.2184586250285627e+18', {'25487201.0'})
('1.603050305813418e+18', {'133663801.0', '25487201.0'})
('1.65979527557623e+18', {'25487201.0'})
('1.6598678949545697e+18', {'25487201.0'})


In [None]:
# How many people follow less than five brands? Remove these from the dictionary
num_brands = 5
num_items = 5
filtered_brands_per_follower = utils.inspect_and_filter_followers(brands_per_follower, num_brands, num_items, remove=True)

The number of keys that follow less than 5 IDs is 15069849, which is 90.22% of the total. Removing these leaves 9.78% of the data, or 1633942 users.
Deleting rows ...
First 5 items in the filtered dictionary:
('1.5634751055898867e+18', {'34570323.0', '123564292.0', '33893706.0', '133663801.0', '19976004.0', '25487201.0'})
('553059033.0', {'36383320.0', '76017958.0', '492648852.0', '34917842.0', '83864876.0', '1.0841564120900444e+18', '168619698.0', '94131301.0', '318695478.0', '63142684.0', '25487201.0', '3025757015.0'})
('1.6596158075449344e+18', {'329683737.0', '125332632.0', '126244275.0', '33893706.0', '338891581.0', '47902100.0', '804263442.0', '25487201.0'})
('1.6570728203445043e+18', {'329683737.0', '125332632.0', '114710148.0', '627377507.0', '33893706.0', '338891581.0', '50592815.0', '47902100.0', '804263442.0', '25487201.0', '318695478.0'})
('7139582.0', {'156318405.0', '19900973.0', '63142684.0', '34570323.0', '492648852.0', '95455794.0', '133663801.0', '53029114.0', '199760

In [None]:
# Save the list of users that followe more than five brands to a .pkl file
with open('/home/livtollanes/NewData/french_markers_follow5.pkl', 'wb') as f:
     pickle.dump(filtered_brands_per_follower, f)


# #load the pickle list from file
# with open('/path/to/your/directory/french_markers_follow5.pkl', 'rb') as f:
#      followers_filtered = pickle.load(f)

Now, filter the actual follower df based on the dictionary

In [None]:
#Loading in one data set
path = '/home/livtollanes/SocialMarkers'
file = 'markers_followers_bios_2023-05-19.csv'

req_cols = ['twitter_id', 'id', 'screen_name', 'description', 'location', 'tweets', 'followers', 'friends', 'likes', 'lists','timestamp_utc']

dtypes = {
    'twitter_id': 'int64',
    'id': 'float64',
    'screen_name': 'object',
    'description': 'object',
    'location': 'object',
    'tweets': 'float64',
    'followers': 'float64',
    'friends': 'float64',
    'witheld_in_countries': 'float64'
}

frenchm_follower_bios = utils.fileloader(path, file, req_cols, dtypes)

In [None]:
# Load the dictionary from the .pkl file
with open('/home/livtollanes/NewData/french_markers_follow5.pkl', 'rb') as f:
    followers_dict = pickle.load(f)


In [None]:
# Convert the keys in followers_dict to integers
follower_ids = [float(key) for key in followers_dict.keys()]

# Filter the DataFrame to only include the rows where 'twitter_id' is in the follower_ids list
filtered_df = frenchm_follower_bios[frenchm_follower_bios['twitter_id'].isin(follower_ids)]

# Print the first few rows of the filtered DataFrame
print(filtered_df.head())

            twitter_id            id   screen_name  \
0           3342215494  3.342215e+09   titisanogo8   
1           3115495713  3.115496e+09  AndreDeybach   
7   738440598865088512  7.384406e+17   isusername1   
28           912721573  9.127216e+08    Armelrichy   
45  900384561640726530  9.003846e+17     SLIVIN001   

                                          description  timestamp_utc  \
0   Je crois en DIEU et à mon travail j'y arrivera...   1.435018e+09   
1                                                 NaN   1.427309e+09   
7                                                 NaN   1.464893e+09   
28                                                NaN   1.351526e+09   
45                                                NaN   1.503503e+09   

                 location  tweets  followers  friends  likes  lists  
0   Ile-de-France, France     6.0       44.0    733.0   91.0    0.0  
1                     NaN     0.0        1.0     40.0    0.0    0.0  
7                     NaN     0.

In [None]:
# Print the number of rows in the DataFrame
print(f"Number of rows: {filtered_df.shape[0]}")
print(f"Number of columns: {filtered_df.shape[1]}")

Number of rows: 1447587
Number of columns: 11


Remove all users that have sent less than 100 tweets

In [None]:
# Filter the DataFrame to only include users with more than 100 tweets
filtered_df2 = filtered_df[filtered_df['tweets'] > 99]

#print the number of rows in the filtered df
print(f"Number of rows after removing all with less than 100 tweets: {filtered_df2.shape[0]}")

#Print how many rows were removed
print(f"Number of rows removed: {filtered_df.shape[0] - filtered_df2.shape[0]}")

Number of rows after removing all with less than 100 tweets: 530919
Number of rows removed: 916668


In [None]:
#  Only include users with more than 25 followers
filtered_df3 = filtered_df2[filtered_df2['followers'] > 24]

# Print the number of rows in the filtered DataFrame   
print(f"Number of rows after removing all with less than 25 followers: {filtered_df3.shape[0]}")

# Print how many rows were removed
print(f"Number of rows removed: {filtered_df2.shape[0] - filtered_df3.shape[0]}")

Number of rows after removing all with less than 25 followers: 445418
Number of rows removed: 85501


In [None]:
# Now write the filtered_df3 to a csv file
# filtered_df3.to_csv('/home/livtollanes/NewData/XXX', index=False)

### Filterings
- Follow at least five brands
- sent at least 100 tweets
- have at least 25 followers
- sent at least five tweets in the first few months of the year the data was collected (maybe not relevant for this data - the twitter bios data does not contain the tweets. Only creation date for the profile)