In [48]:
# Standard library imports
import os
import re
import csv
import sys
import html
from datetime import datetime
from collections import defaultdict

# Third-party library imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import dask.dataframe as dd
import psutil
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Local application/library specific imports
import utils
from utils import *



from unidecode import unidecode
import importlib

## 1. Filtering french brands

Here, we aim to keep only the French brands.
- One approach is to detect the language of each brand's bio, e.g. with the gcld3 library (?)
- Another approach is to look at the account's location


Plan:
- Load the df of the brands and their bios. 
- Filter
- Now, filter the reduced follower df based on these final french brands

- Are all users now French? (Look at the combination of language and location; potentially drop those that do not have any usable indicator.)


- Might need to manually inspect these later on

### Preprocessing data files

In [58]:
# Load the marker (brand) bios and rename the 'id' column to 'marker_id'.
load_path = '/home/livtollanes/SocialMarkers'
save_path = '/home/livtollanes/NewData'
file = 'markers_bios_2023-05-19.csv'

# Columns requested from the raw file.
req_cols = [
    'twitter_name', 'id', 'screen_name', 'description', 'location',
    'tweets', 'followers', 'friends', 'likes', 'lists', 'timestamp_utc',
]

# Free-text columns are typed 'object'; every other requested column is
# typed 'float64'.
# NOTE(review): 'id' is typed float64, but some marker ids printed further
# down are ~1e18, beyond float64's exact-integer range (2**53) — confirm
# that rounding cannot merge distinct ids.
text_cols = {'twitter_name', 'screen_name', 'description', 'location'}
dtypes = {col: 'object' if col in text_cols else 'float64' for col in req_cols}

new_column_names = {'id': 'marker_id'}

markers_bios = utils.load_and_rename(
    load_path,
    save_path,
    file,
    req_cols,
    dtypes,
    new_column_names,
)


In [62]:
# Load the marker -> follower edge list and rename 'id' to 'marker_id'.
load_path = '/home/livtollanes/SocialMarkers'
save_path = '/home/livtollanes/NewData'
file = 'markers_followers_2023-05-19.csv'

# Both requested columns are typed float64.
# NOTE(review): follower ids printed further down are ~1.6e18, beyond
# float64's exact-integer range (2**53) — confirm that rounding cannot
# merge distinct ids.
req_cols = ['id', 'follower_id']
dtypes = dict.fromkeys(req_cols, 'float64')

new_column_names = {'id': 'marker_id'}

markers_followers = utils.load_and_rename(
    load_path,
    save_path,
    file,
    req_cols,
    dtypes,
    new_column_names,
)

In [63]:
# Load the follower bios and rename the 'id' column to 'follower_id'.
load_path = '/home/livtollanes/SocialMarkers'
save_path = '/home/livtollanes/NewData'
file = 'markers_followers_bios_2023-05-19.csv'


# Columns requested from the raw file.
req_cols = ['twitter_id', 'id', 'screen_name', 'description', 'location', 'tweets', 'followers', 'friends', 'likes', 'lists','timestamp_utc']

# Explicit dtypes. Every key must name a column in req_cols; an entry for a
# column that is not requested has nothing to apply to.
# NOTE(review): 'likes', 'lists' and 'timestamp_utc' have no explicit dtype
# here (the markers_bios loader types them float64) — confirm the inferred
# dtypes are what downstream steps expect.
# NOTE(review): 'id' is typed float64, but follower ids printed further down
# are ~1.6e18, beyond float64's exact-integer range (2**53) — confirm that
# rounding cannot merge distinct ids.
dtypes = {
    'twitter_id': 'int64',
    'id': 'float64',
    'screen_name': 'object',
    'description': 'object',
    'location': 'object',
    'tweets': 'float64',
    'followers': 'float64',
    'friends': 'float64',
}

new_column_names = {'id': 'follower_id'}

follower_bios = utils.load_and_rename(load_path, save_path, file, req_cols, dtypes, new_column_names)

In [64]:
# Re-import utils so that edits to the module are picked up.
importlib.reload(utils)

# How is my data delimited? Print the first line of each file in NewData.
path = '/home/livtollanes/NewData'
file1 = 'markers_bios_2023-05-19.csv'
file2 = 'markers_followers_2023-05-19.csv'
file3 = 'markers_followers_bios_2023-05-19.csv'

for csv_name in (file1, file2, file3):
    utils.print_lines(path, csv_name, 0, 1)

# The data is comma delimited

Printing lines from file: markers_bios_2023-05-19.csv
twitter_name,marker_id,screen_name,description,timestamp_utc,location,tweets,followers,friends,likes,lists

Printing lines from file: markers_followers_2023-05-19.csv
marker_id,follower_id

Printing lines from file: markers_followers_bios_2023-05-19.csv
twitter_id,follower_id,screen_name,description,timestamp_utc,location,tweets,followers,friends,likes,lists



### Now filtering on language

In [65]:
# Remove emojis, weird font, and detect language in descriptions

# Process the descriptions in the DataFrame.
# NOTE(review): presumably this adds the 'description_noems' and 'language'
# columns seen in the saved output below — confirm in utils.process_description.
markers_bios = utils.process_description(markers_bios)

# Split the DataFrame by language: rows detected as 'fr' vs. everything else
markers_bios_french, markers_bios_other = utils.split_by_language(markers_bios, 'fr')

# Finally, print information about the resulting DataFrames.
# Each label names the frame that is actually passed in, so the printed
# row counts cannot be attributed to the wrong frame.
utils.print_df_info(markers_bios_french, 'markers_bios_french')
utils.print_df_info(markers_bios_other, 'markers_bios_other')

Number of rows in markers_bios: 142
Number of rows in markers_bios_other: 95


Manual inspection - did we miss any brands that are french?
- This part was done by inspecting the data frame and looking up brands

The brands that were incorrectly detected as non french:

- 21: Lafuma_France
- 30: CarrefourFrance
- 36: CasinoEnseigne
- 37: Supermarche_G20
- 38: VogueFrance
- 39: FigaroMagazine
- 41: LeMediaTv
- 48: BFMTV
- 92: TeleLoisirs
- 94: ParisMatch
- 96: Telerama
- 101: EntMagazine
- 106: OnzeMondial
- 113: GQ_France
- 119: LEXPRESS
- 121: courrrierinter
- 124: RCLens
- 128: OL
- 129: ognice
- 131: StadeDeReims
- 133: MontpellierHSC
- 135: RCSA
- 171: HECParis
- 178: SciencesPo
- 181: Univbordeaux
- 182: UnivLyon1
- 199: UniversiteCergy
- 202: centralesupelec
- 208: ENSAEparis
- 215: esdes_BS
- 221: LaCoudouliere
- 226: LyceeProTissie
- 227: PSVLaTournelle
- 230: Decathlon
- 231: Darty_Officiel
- 233: Fnac
- 236: AmazonFrance

In [66]:
# Row indices of the brands that manual inspection showed to be French even
# though language detection put them in markers_bios_other.
indices_to_change = [
    21, 30, 36, 37, 38, 39, 41, 48, 92, 94,
    96, 101, 106, 113, 119, 121, 124, 128, 129, 131,
    133, 135, 171, 178, 181, 182, 199, 202, 208, 215,
    221, 226, 227, 230, 231, 233, 236,
]

# Add these brands to the French set and save the file.
# NOTE(review): the result of the call is not assigned; confirm that
# utils.add_extrabrands updates markers_bios_french in place, since later
# cells keep using that variable.
path_tosave = '/home/livtollanes/NewData/workdata'
utils.add_extrabrands(
    indices_to_change,
    markers_bios_other,
    markers_bios_french,
    path_tosave,
)

markers_bios_french:
   twitter_name     marker_id  screen_name  \
1      adidasFR  2.548720e+07     adidasFR   
9   ASICSFrance  3.042088e+09  ASICSFrance   
10  KappaFrance  2.831937e+09  KappaFrance   
27     franprix  1.023951e+08     franprix   
28     Monoprix  6.247068e+07     Monoprix   

                                          description  timestamp_utc location  \
1                        𝕭𝖎𝖊𝖓𝖛𝖊𝖓𝖚𝖊 𝖆𝖚 𝕮𝖑𝖚𝖇 𝕺𝖗𝖎𝖌𝖎𝖓𝖆𝖑𝖘.   1.237546e+09    Paris   
9   A travers l'innovation, nous cherchons à t'ins...   1.424162e+09   France   
10  Peu importe tes performances, l'important rest...   1.411656e+09   France   
27  franprix, l'enseigne spécialiste de la proximi...   1.262791e+09   France   
28  Mode, beauté, food & déco. Toutes vos courses ...   1.249288e+09      NaN   

     tweets  followers  friends   likes  lists  \
1   16680.0   393006.0   1282.0  5287.0  433.0   
9    1703.0    20485.0    104.0   641.0   98.0   
10   3787.0    22493.0    532.0  4023.0   44.0   
27   8225.0    

Unnamed: 0,twitter_name,marker_id,screen_name,description,timestamp_utc,location,tweets,followers,friends,likes,lists,description_noems,language,corrected_language_country
1,adidasFR,2.548720e+07,adidasFR,𝕭𝖎𝖊𝖓𝖛𝖊𝖓𝖚𝖊 𝖆𝖚 𝕮𝖑𝖚𝖇 𝕺𝖗𝖎𝖌𝖎𝖓𝖆𝖑𝖘.,1.237546e+09,Paris,16680.0,393006.0,1282.0,5287.0,433.0,Bienvenue au Club Originals.,fr,
9,ASICSFrance,3.042088e+09,ASICSFrance,"A travers l'innovation, nous cherchons à t'ins...",1.424162e+09,France,1703.0,20485.0,104.0,641.0,98.0,"A travers l'innovation, nous cherchons a t'ins...",fr,
10,KappaFrance,2.831937e+09,KappaFrance,"Peu importe tes performances, l'important rest...",1.411656e+09,France,3787.0,22493.0,532.0,4023.0,44.0,"Peu importe tes performances, l'important rest...",fr,
27,franprix,1.023951e+08,franprix,"franprix, l'enseigne spécialiste de la proximi...",1.262791e+09,France,8225.0,11630.0,141.0,2306.0,175.0,"franprix, l'enseigne specialiste de la proximi...",fr,
28,Monoprix,6.247068e+07,Monoprix,"Mode, beauté, food & déco. Toutes vos courses ...",1.249288e+09,,30718.0,50806.0,4709.0,5940.0,570.0,"Mode, beaute, food & deco. Toutes vos courses ...",fr,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227,PSVLaTournelle,1.179072e+18,PSVLaTournelle,,1.569948e+09,PONT SAINT VINCENT (54),592.0,289.0,282.0,506.0,3.0,,unknown,fr
230,Decathlon,1.262671e+08,Decathlon,À fond la forme,1.269508e+09,"Villeneuve d'Ascq (59,France)",13167.0,170643.0,372.0,5286.0,382.0,A fond la forme,en,fr
231,Darty_Officiel,1.956809e+08,Darty_Officiel,Suivez toute l'actualité de #Darty : bons plan...,1.285581e+09,France,22200.0,115315.0,156.0,1272.0,379.0,Suivez toute l'actualite de #Darty : bons plan...,en,fr
233,Fnac,8.806412e+06,Fnac,Compte officiel de la #Fnac : Actu High Tech /...,1.189502e+09,France,220101.0,501158.0,1136.0,6135.0,1056.0,Compte officiel de la #Fnac : Actu High Tech /...,en,fr


So far in this section, we have worked with the bios of the markers. We separated out all rows belonging to brands that are not French. The final output of this section consists of two DataFrames: one for the French brands and one for the other brands (selected columns only).

In the next section, we will load the data with user bios and metadata. We will filter based on certain metrics, and also ignore all connections between followers and non french brands

## 2. Finding french brands and their followers

- In this part, we want to filter the data to only include french brands with 10 000 or more followers

In [68]:
#Take a look at the data (Of the markers and their followers)
# Directory and file name of the renamed marker -> follower csv
# (header: marker_id,follower_id) that is inspected below.
path = '/home/livtollanes/NewData/'
file = 'markers_followers_2023-05-19.csv'


def print_lines(path, file, start_line=0, end_line=5):
    """Print lines ``start_line`` (inclusive) to ``end_line`` (exclusive) of a text file.

    Args:
        path: Directory that contains the file (a trailing slash is fine).
        file: Name of the file inside ``path``.
        start_line: Zero-based index of the first line to print.
        end_line: Zero-based index at which to stop; that line is not printed.

    Each line is printed with its own newline kept, so the output shows a
    blank line after every printed line. Reading stops at the end of the
    file, so a file shorter than ``end_line`` does not produce extra empty
    output lines.

    Raises:
        OSError: If the file cannot be opened.
    """
    # NOTE(review): this local definition shadows any print_lines brought in
    # by `from utils import *`; calls written as utils.print_lines are unaffected.
    with open(os.path.join(path, file), 'r') as f:
        for i, line in enumerate(f):
            if i >= end_line:
                break  # nothing beyond end_line is needed; stop reading
            if i >= start_line:
                print(line)

# Show the start of the file using the defaults (start_line=0, end_line=5).
print_lines(path, file)


marker_id,follower_id

415859364.0,1.6553368048311747e+18

415859364.0,1.6596481414974543e+18

415859364.0,1.525534139478311e+18

415859364.0,1.6596488832096748e+18



In [78]:
# Restrict the marker -> follower edge list to the French markers.
# NOTE(review): presumably keeps the rows of markers_followers whose
# 'marker_id' also occurs in markers_bios_french['marker_id'] — confirm
# against utils.streamline_ids.
markers_followers_french = utils.streamline_ids(markers_followers, 'marker_id', markers_bios_french, 'marker_id')


Removed 106747455 rows.
19597957 rows are left.


In [79]:
# Persist the French-only marker -> follower data; the follower-count step
# further down reads this csv back from disk.
markers_followers_french.to_csv(
    '/home/livtollanes/NewData/workdata/french_markers_followers_2023-05-19.csv',
    index=False,
)

- Now, we have data for the french markers and their followers (markers_followers_french )
- we also have a df with all french markers and their bios (markers_bios_french)

Followers per brand - only including those with 10 000 +

In [86]:
# How many followers are there per brand? - faster to read the csv file rather than directly using the df
importlib.reload(utils)

filepath = '/home/livtollanes/NewData/workdata/french_markers_followers_2023-05-19.csv'
followers_per_brand_count = utils.create_followers_per_brand_dict(filepath)

# Minimum number of followers a brand needs in order to be kept.
MIN_FOLLOWERS_PER_BRAND = 10_000

# Filter the dictionary to only include brands with at least that many followers
brands_10000_followers = {
    brand: followers
    for brand, followers in followers_per_brand_count.items()
    if followers >= MIN_FOLLOWERS_PER_BRAND
}

# Print the number of such brands
print(len(brands_10000_followers))

# Now, keeping only the marker_ids in the two relevant dfs based on brands_10000_followers.
# The follower edge list is taken from the French-filtered frame: the brand
# dictionary was built from the French csv, so the same rows are selected
# while far fewer rows have to be scanned than in the full markers_followers.
markers_bios_french_10k = utils.streamline_ids_dict(markers_bios_french, 'marker_id', brands_10000_followers)
markers_followers_french10k = utils.streamline_ids_dict(markers_followers_french, 'marker_id', brands_10000_followers)


102


Now, our french marker_follower data is filtered so that it only includes french brands with 10 000 followers or more

In [92]:
#Streamline the follower_bios to only include the followers of the 10k brands
# NOTE(review): presumably keeps the rows of follower_bios whose
# 'follower_id' also occurs in markers_followers_french10k['follower_id'] —
# confirm against utils.streamline_ids.
follower_bios_french10k = utils.streamline_ids(follower_bios, 'follower_id', markers_followers_french10k, 'follower_id')

Removed 60323433 rows.
10343213 rows are left.


In [93]:
# Write the three 10k-filtered frames to csv, each to its own file in workdata.
workdata_outputs = (
    (markers_bios_french_10k, '/home/livtollanes/NewData/workdata/french_markers_bios_10k_2023-05-19.csv'),
    (markers_followers_french10k, '/home/livtollanes/NewData/workdata/french_markers_followers_10k_2023-05-19.csv'),
    (follower_bios_french10k, '/home/livtollanes/NewData/workdata/french_follower_bios_10k_2023-05-19.csv'),
)
for frame, csv_path in workdata_outputs:
    frame.to_csv(csv_path, index=False)

In [None]:
# NOTE(review): disabled diagnostic cell, kept for reference. The names used
# below (french_marker_bios, french_markers_followers,
# filtered_markers_followers) and the 'id' column are not defined in the
# cells visible in this notebook (the loaded frames use 'marker_id'), so
# the code needs renaming before it can be re-enabled.

# # Count the unique ids in french_marker_bios, markers_followers and french_markers_followers
# print(french_marker_bios['id'].nunique())
# print(markers_followers['id'].nunique())
# print(french_markers_followers['id'].nunique())

# # Print the unique ids that occur in french_marker_bios but not in french_markers_followers
# print(set(french_marker_bios['id']) - set(french_markers_followers['id']))

# # Get the unique ids that occur in french_marker_bios but not in filtered_markers_followers
# missing_ids = set(french_marker_bios['id']) - set(filtered_markers_followers['id'])

# # Filter french_marker_bios to only include rows with these ids
# missing_brands = french_marker_bios[french_marker_bios['id'].isin(missing_ids)]

# # Print the missing brands
# print(missing_brands)


Note: three brands that exist in the French brands bio DataFrame have no rows in the marker–follower data. This must be inspected later.

## 3. Filtering users

- Now that we have the french brands with enough followers, we want to remove users that don't follow enough of these 
- This is to match the procedure of He and Tsvetkova (2023)
- The purpose is to ensure we have enough information to generate SES estimates for users.

Create dictionary of [keys: follower_Id, Value: Brand_id]
Without loading the data into python

In [95]:
#Create a dictionary of [keys:follower_id, values: brands] based on the link between french brands and their followers
# Re-import utils so that edits to the module are picked up.
importlib.reload(utils)

# The dictionaries are built from the csv on disk rather than from the
# in-memory frame. In the printed output below, keys and brand ids are
# strings such as '25487201.0', and each value is a set of brand ids.
filepath = '/home/livtollanes/NewData/workdata/french_markers_followers_10k_2023-05-19.csv'
brands_per_follower, brands_per_follower_count = utils.create_brands_per_follower_dict(filepath)

#How many unique values (brands) are there in the dictionary?

# The second argument (5) matches the "First 5 items" shown in the output.
utils.inspect_dict(brands_per_follower, 5)



The number of unique values in the dictionary is 102.
The number of keys in the dictionary is 10346146.
First 5 items in the dictionary:
('1.6598901997850952e+18', {'861574608.0', '25487201.0'})
('1.2184586250285627e+18', {'25487201.0'})
('1.603050305813418e+18', {'25487201.0'})
('1.65979527557623e+18', {'25487201.0'})
('1.6598678949545697e+18', {'25487201.0'})


In [96]:
# How many people follow less than five french brands? Remove these from the dictionary
# num_brands: followers with fewer than this many brands are reported and
# removed (per the printed summary below).
# num_items: presumably the number of example items printed afterwards
# ("First 5 items" in the output) — confirm in utils.inspect_and_filter_followers.
num_brands = 5
num_items = 5
filtered_brands_per_follower = utils.inspect_and_filter_followers(brands_per_follower, num_brands, num_items, remove=True)

The number of keys that follow less than 5 IDs is 9738747, which is 94.13% of the total. Removing these leaves 5.87% of the data, or 607399 users.
Deleting rows ...
First 5 items in the filtered dictionary:
('553059033.0', {'25487201.0', '3025757015.0', '318695478.0', '36383320.0', '63142684.0', '94131301.0', '76017958.0', '34917842.0', '1.0841564120900444e+18', '492648852.0'})
('1.6570728203445043e+18', {'114710148.0', '47902100.0', '25487201.0', '338891581.0', '318695478.0', '50592815.0', '804263442.0'})
('7139582.0', {'25487201.0', '17710206.0', '53029114.0', '95455794.0', '19856081.0', '96090970.0', '19976004.0', '94544423.0', '19900973.0', '63142684.0', '34570323.0', '492648852.0'})
('1.6347941001541386e+18', {'114710148.0', '47902100.0', '25487201.0', '804263442.0', '338891581.0', '318695478.0', '50592815.0', '34570323.0'})
('1.6592206427832648e+18', {'114710148.0', '25487201.0', '804263442.0', '122333150.0', '338891581.0', '36383320.0', '19976004.0', '94544423.0', '34570323.0', 

In [98]:
#Filter the follower bio file to only include the followers that follow at least 5 of the french brands with more than 10 000 followers
# NOTE(review): presumably keeps the rows whose 'follower_id' is a key of
# filtered_brands_per_follower; the keys are strings in the printed output
# while the column was loaded as float64 — confirm how
# utils.streamline_ids_dict matches the two.
follower_bios_french10k_5 = utils.streamline_ids_dict(follower_bios_french10k, 'follower_id', filtered_brands_per_follower)

Removed 9735983 rows.
607230 rows are left.


Remove all users that have sent fewer than 100 tweets

In [100]:
# Keep only users that have sent at least 100 tweets (tweets > 99).
enough_tweets = follower_bios_french10k_5['tweets'].gt(99)
follower_bios_french10k_5_100 = follower_bios_french10k_5.loc[enough_tweets]

# Report how many rows survive the filter ...
print(f"Number of rows after removing all with less than 100 tweets: {follower_bios_french10k_5_100.shape[0]}")

# ... and how many rows were dropped by it.
print(f"Number of rows removed: {follower_bios_french10k_5.shape[0] - follower_bios_french10k_5_100.shape[0]}")

Number of rows after removing all with less than 100 tweets: 330710
Number of rows removed: 276520


In [101]:
# Keep only users that have at least 25 followers (followers > 24).
enough_followers = follower_bios_french10k_5_100['followers'].gt(24)
follower_bios_french10k_5_100_25 = follower_bios_french10k_5_100.loc[enough_followers]

# Report how many rows survive the filter ...
print(f"Number of rows after removing all with less than 25 followers: {follower_bios_french10k_5_100_25.shape[0]}")

# ... and how many rows were dropped by it.
print(f"Number of rows removed: {follower_bios_french10k_5_100.shape[0] - follower_bios_french10k_5_100_25.shape[0]}")

Number of rows after removing all with less than 25 followers: 286434
Number of rows removed: 44276


In [None]:
# Now write the fully filtered follower bios (>= 5 brands, >= 100 tweets, >= 25 followers) to a csv file
# NOTE(review): unlike the other outputs in workdata, this file name has no
# '.csv' extension — confirm what downstream readers expect before renaming.
follower_bios_french10k_5_100_25.to_csv('/home/livtollanes/NewData/workdata/follower_bios_french10k_5_100_25', index=False)

### Filterings
- Follow at least five brands
- sent at least 100 tweets
- have at least 25 followers
- sent at least five tweets in the first few months of the year the data was collected (maybe not relevant for this data - the twitter bios data does not contain the tweets. Only creation date for the profile)