In [17]:
#load libraries for data wrangling
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
import os
import dask.dataframe as dd
import csv
import importlib
import html
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import psutil


#load function files
import Functions 
from Functions import * 

In [2]:
# Re-execute Functions.py so that edits to it take effect without restarting the kernel.
# Note: names already bound by `from Functions import *` keep pointing at the old objects;
# only access through the `Functions` module itself sees the reloaded code.
importlib.reload(Functions)

<module 'Functions' from '/home/livtollanes/10.jan-thesis/Code/Functions.py'>

### Data from scratch - Pandas

In [10]:
# How is my data delimited?
# Directory and file name of the follower-bios CSV whose first lines are printed in this cell.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
path = '/home/livtollanes/SocialMarkers'
file = 'markers_followers_bios_2023-05-19.csv'



def print_lines(path, file, start_line=0, end_line=10):
    """Print a window of raw lines from a text file.

    Parameters
    ----------
    path : str
        Directory that contains the file.
    file : str
        File name; joined to ``path`` with a ``/``.
    start_line : int, optional
        Zero-based index of the first line to print (inclusive).
    end_line : int, optional
        Zero-based index at which printing stops (exclusive).

    Notes
    -----
    Each line is printed exactly once, without the extra blank line that
    ``print`` would add after a line that still carries its own newline.
    If the file has fewer than ``end_line`` lines, printing stops at the end
    of the file instead of emitting empty lines.
    """
    with open(f"{path}/{file}", 'r') as f:
        for i, line in enumerate(f):
            if i >= end_line:
                break
            if i >= start_line:
                # Drop the line's own newline; print() supplies one.
                print(line.rstrip('\n'))

# Peek at the start of the raw file using the function's default line window.
print_lines(path, file)


#The data is comma delimited

twitter_id,id,screen_name,name,description,url,timestamp_utc,local_time,location,verified,protected,tweets,followers,friends,likes,lists,image,default_profile,default_profile_image,witheld_in_countries,witheld_scope

3342215494,3342215494,titisanogo8,Titi sanogo,Je crois en DIEU et à mon travail j'y arriverai.....,,1435017944,2015-06-23T00:05:44,"Ile-de-France, France",0,0,6,44,733,91,0,https://pbs.twimg.com/profile_images/1249394390029742081/xuVolLn6_normal.jpg,1,0,,

3115495713,3115495713,AndreDeybach,DEYBACH André,,,1427309108,2015-03-25T18:45:08,,0,0,0,1,40,0,0,https://pbs.twimg.com/profile_images/580803690757533697/pHNcCBLh_normal.jpg,1,0,,

244075010,244075010,matttownley1985,Matt Townley,"Hotelier, traveller, fan of all things hospitality, great food and fine wine! All views my own etc!!",,1296220595,2011-01-28T13:16:35,"Manchester, England",0,1,2535,772,1264,1251,7,https://pbs.twimg.com/profile_images/928075998930681856/ZFXboKc3_normal.jpg,0,0,,

2986463442,2986463442,alex_guev

In [43]:
# Loading in one data set: location of the follower-bios CSV.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
path = '/home/livtollanes/SocialMarkers'
file = 'markers_followers_bios_2023-05-19.csv'

def load_file(path, file, req_cols, dtype=None):
    """Read selected columns of a comma-delimited, double-quoted CSV into a DataFrame.

    Parameters
    ----------
    path : str
        Directory that contains the file.
    file : str
        File name; joined to ``path`` with a ``/``.
    req_cols : list-like
        Column names handed to ``usecols``; only these columns are loaded.
    dtype : dict or type, optional
        Forwarded unchanged to ``pandas.read_csv``. The default ``None`` lets
        pandas infer every dtype, exactly as before this parameter existed.
        Passing e.g. ``{'id': str}`` keeps long numeric identifiers exact
        instead of letting inference turn a column with gaps into float64.

    Returns
    -------
    pandas.DataFrame
        The requested columns of the file.
    """
    return pd.read_csv(
        f"{path}/{file}",
        delimiter=',',
        quotechar='"',
        # Infer each column's dtype from the whole column rather than from
        # internal chunks, which avoids mixed-type columns.
        low_memory=False,
        usecols=req_cols,
        dtype=dtype,
    )


# Columns to keep from the bios file; every other column is skipped at read time.
req_cols = [
    'twitter_id',
    'id',
    'screen_name',
    'description',
    'location',
    'tweets',
    'followers',
    'friends',
    'likes',
    'lists',
    'timestamp_utc',
]

# Unused draft of explicit dtypes, kept commented out (dtypes are currently inferred):
# dtypes = {
#     'twitter_id': 'int64',
#     'id': 'float64',
#     'screen_name': 'object',
#     'description': 'object',
#     'location': 'object',
#     'tweets': 'float64',
#     'followers': 'float64',
#     'friends': 'float64',
#     'witheld_in_countries': 'float64'
# }

# NOTE(review): the recorded df.info() output lists `id` as float64; float64 cannot
# represent every 64-bit id exactly — confirm whether `id` needs an exact dtype.
df = load_file(path, file, req_cols=req_cols)

In [41]:
# How much memory available?
def get_available_memory():
    """Return the ``available`` field of ``psutil.virtual_memory()``.

    Returns
    -------
    The value psutil reports as available memory (psutil documents this
    figure in bytes).
    """
    memory_stats = psutil.virtual_memory()
    return memory_stats.available

# Report the available memory, dividing by 1024**3 to scale the psutil figure.
# NOTE(review): the divisor is 1024**3 (GiB) while the label says "GB".
available_memory = get_available_memory()
print(f"Available memory: {available_memory / (1024 * 1024 * 1024)} GB")


# check size of loaded df and print 
def get_df_memory_usage(df):
    """Print pandas' summary of ``df`` with ``memory_usage='deep'``.

    The report is written by ``DataFrame.info`` itself; this function hands
    back whatever ``info`` returns (``None`` according to the pandas
    documentation), so the useful result is the printed text, not the
    return value.
    """
    info_result = df.info(memory_usage='deep')
    return info_result

# Print the dtype / memory summary for the loaded frame.
get_df_memory_usage(df)

Available memory: 67.46749496459961 GB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70666646 entries, 0 to 70666645
Data columns (total 10 columns):
 #   Column       Dtype  
---  ------       -----  
 0   twitter_id   int64  
 1   id           float64
 2   screen_name  object 
 3   description  object 
 4   location     object 
 5   tweets       float64
 6   followers    float64
 7   friends      float64
 8   likes        float64
 9   lists        float64
dtypes: float64(6), int64(1), object(3)
memory usage: 16.6 GB


#### Data inspection

In [44]:
# Check for multiple lines per user: keep=False flags every row whose
# twitter_id occurs more than once, so the sum counts all rows involved.
duplicates = df.duplicated(subset='twitter_id', keep=False)
print(f"Number of duplicate rows: {duplicates.sum()}")

# Inspect min and max value of column friends with f-strings (currently disabled)
# print(f"Min friends: {df.friends.min()}")
# print(f"Max friends: {df.friends.max()}")

# Preview the first rows. This must be the last expression of the cell:
# Jupyter only renders a cell's final expression, so a bare df.head()
# earlier in the cell is evaluated and silently discarded.
# NOTE(review): the preview contains user-level fields (screen_name,
# description, location) — clear the output before sharing the notebook.
df.head()



Number of duplicate rows: 0


### Filtering out irrelevant users based on follower count

In [1]:
# How is my data delimited?
# Directory and file name of the marker-followers CSV whose first lines are printed in this cell.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
path = '/home/livtollanes/SocialMarkers'
file = 'markers_followers_2023-05-19.csv'



def print_lines(path, file, start_line=0, end_line=10):
    """Print a window of raw lines from a text file.

    Parameters
    ----------
    path : str
        Directory that contains the file.
    file : str
        File name; joined to ``path`` with a ``/``.
    start_line : int, optional
        Zero-based index of the first line to print (inclusive).
    end_line : int, optional
        Zero-based index at which printing stops (exclusive).

    Notes
    -----
    Each line is printed exactly once, without the extra blank line that
    ``print`` would add after a line that still carries its own newline.
    If the file has fewer than ``end_line`` lines, printing stops at the end
    of the file instead of emitting empty lines.
    """
    with open(f"{path}/{file}", 'r') as f:
        for i, line in enumerate(f):
            if i >= end_line:
                break
            if i >= start_line:
                # Drop the line's own newline; print() supplies one.
                print(line.rstrip('\n'))

# Peek at the start of the raw file using the function's default line window.
print_lines(path, file)


id,cursor,follower_id

415859364,,1655336804831174657

415859364,,1659648141497454593

415859364,,1525534139478310915

415859364,,1659648883209674764

415859364,,1659648836594458626

415859364,,881616301

415859364,,1618696506424303637

415859364,,1659647872202055692

415859364,,1659647973637357568



In [3]:
import csv
from collections import defaultdict

# Map each follower id to the set of distinct brand ids seen for it. Using a
# set means a (brand, follower) pair that appears on several rows counts once.
brand_sets_per_follower = defaultdict(set)

# Open the CSV file. newline='' is what the csv module documentation asks for
# when a file object is passed to a reader, so line endings and newlines inside
# quoted fields are handled by the csv parser itself.
with open('/home/livtollanes/SocialMarkers/markers_followers_2023-05-19.csv', 'r', newline='') as f:
    reader = csv.DictReader(f)
    for row in reader:
        # Add the brand id to the set of brands for this follower
        brand_sets_per_follower[row['follower_id']].add(row['id'])

# Now convert the sets to counts: follower_id -> number of distinct brands.
brands_per_follower = {follower_id: len(brands) for follower_id, brands in brand_sets_per_follower.items()}

# The intermediate sets are no longer needed; release them so only the counts stay in memory.
del brand_sets_per_follower

In [4]:
# Inspect the first key/value pairs.
# Get an iterator over the dictionary's items
items = iter(brands_per_follower.items())

# Print up to 5 items. Pairing with range(5) ends the loop quietly when the
# dictionary holds fewer than 5 entries, instead of next() raising StopIteration;
# range is listed first so no sixth item is pulled from `items`.
for _, item in zip(range(5), items):
    print(item)

    

('1655336804831174657', 4)
('1659648141497454593', 3)
('1525534139478310915', 3)
('1659648883209674764', 1)
('1659648836594458626', 1)


In [5]:
# Do several keys share the same value? (A dict cannot hold the same key twice, so the check below compares values.)
def has_duplicate_nids(dictionary):
    """Report whether any value of ``dictionary`` occurs under more than one key.

    Each entry's value is treated as the "follower nid". This is a
    placeholder: the real way of deriving a follower nid still has to be
    filled in. Scanning stops at the first repeated value.

    Parameters
    ----------
    dictionary : dict
        Mapping whose values are compared; values must be hashable.

    Returns
    -------
    bool
        True as soon as a value is seen a second time, otherwise False.
    """
    seen_nids = set()
    for follower_nid in dictionary.values():  # TODO: replace with the actual follower-nid lookup
        if follower_nid in seen_nids:
            return True
        seen_nids.add(follower_nid)
    return False

('1659641273198714887', 1)
('3293231873', 1)
('1494746709041287169', 1)
('1658482798590853122', 1)
('1659647834214244353', 1)
