In [15]:
#load libraries for data wrangling
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
import os
import dask.dataframe as dd
import csv
import importlib


#load function files
import Functions 
from Functions import * 

In [16]:
# Pick up edits made to Functions.py without restarting the kernel.
importlib.reload(Functions)
# reload() re-executes the module but does not rebind names that were copied
# into this namespace by the earlier `from Functions import *`; repeat the
# star import so the notebook uses the reloaded definitions.
from Functions import *

<module 'Functions' from '/home/livtollanes/10.jan-thesis/Code/Functions.py'>

### Data inspection

In [17]:
# Directory that holds the SocialMarkers files (absolute path on the author's machine).
data_path = "/home/livtollanes/SocialMarkers"
# load_data is presumably provided by the Functions star import — TODO confirm.
data = load_data(data_path)

# List the names of the loaded datasets (the cell's displayed result).
data.keys()


dict_keys(['marker_bios', 'marker_followers_bios', 'marker_followers', 'marker_friends', 'readme'])

In [18]:
# Preview every loaded entry: plain-text entries are printed whole,
# everything else is printed via its head().
for name, df in data.items():
    print(f"First few rows of '{name}':")
    preview = df if isinstance(df, str) else df.head()
    print(preview)

First few rows of 'marker_bios':
  twitter_name           id screen_name           name  \
0         Nike  415859364.0        Nike           Nike   
1     adidasFR   25487201.0    adidasFR  adidas France   
2      Lacoste   18481641.0     Lacoste        Lacoste   
3     Converse  183398746.0    Converse       Converse   
4      Moncler  187897893.0     Moncler        Moncler   

                                         description  \
0               #BlackLivesMatter and #StopAsianHate   
1                       𝕭𝖎𝖊𝖓𝖛𝖊𝖓𝖚𝖊 𝖆𝖚 𝕮𝖑𝖚𝖇 𝕺𝖗𝖎𝖌𝖎𝖓𝖆𝖑𝖘.   
2  Moving with the world for 90 years. 🐊 #Lacoste...   
3  Get inspired by the next generation of artists...   
4  Born in the mountains, lives in the city. The ...   

                                      url  timestamp_utc           local_time  \
0                         http://nike.com   1.321655e+09  2011-11-18T22:31:18   
1    http://adidas.fr/impossibleisnothing   1.237546e+09  2009-03-20T10:46:03   
2                 http://laco.st/HLXGd

In [None]:
# Show the first 10 'description' and 'location' values of the two bios DataFrames
for name, df in data.items():
    # Skip every entry that is not one of the two bios tables
    if name not in ('marker_bios', 'marker_followers_bios'):
        continue
    print(f"First 10 rows of '{name}':")
    text_columns = df[['description', 'location']]
    print(text_columns.head(10))

In [None]:
# Print the dtype of the free-text columns, first for marker_bios and then
# for marker_followers_bios ('description' before 'location' in each case).
for frame_key in ('marker_bios', 'marker_followers_bios'):
    for column_name in ('description', 'location'):
        print(data[frame_key][column_name].dtype)

In [None]:
# Collect the distinct locations in marker_bios, then report how many there
# are and list them.
# `unique_locations` keeps its original meaning (the un-computed unique series).
unique_locations = data['marker_bios']['location'].unique()

# Materialise the result once and reuse it for both the count and the listing,
# instead of evaluating the unique set a second time just to take its length.
unique_location_values = unique_locations.compute()

print('Number of unique locations in marker_bios: {}'.format(len(unique_location_values)))
print(unique_location_values)

In [None]:
# Report how many distinct locations occur in marker_followers_bios
followers_locations = data['marker_followers_bios']['location']
n_unique_follower_locations = len(followers_locations.unique())
print('Number of unique locations in marker_followers_bios: {}'.format(n_unique_follower_locations))

In [None]:
# Look at the record at position 287456 of marker_followers_bios.
# NOTE(review): if this frame is a dask DataFrame, positional row access via
# .iloc is likely unsupported — confirm that this cell actually runs.
followers_bios = data['marker_followers_bios']
print(followers_bios.iloc[287456])

In [None]:
# Report how many distinct locations occur in marker_bios
marker_locations = data['marker_bios']['location']
print('Number of unique locations in marker_bios: {}'.format(len(marker_locations.unique())))


In [45]:
# Count the rows of both bios tables; shape[0] has to be computed explicitly.
for name in ('marker_bios', 'marker_followers_bios'):
    dask_df = data[name]
    lazy_row_count = dask_df.shape[0]
    num_rows = lazy_row_count.compute()
    print(f'The total number of rows in {name} is {num_rows}.')

The total number of rows in marker_bios is 237.


ParserError: Error tokenizing data. C error: EOF inside string starting at row 287380

#### Chunked data

In [7]:
# Load every dataset as a chunk iterator (1000 rows per chunk)
chunks = load_data_in_chunks(data_path, chunksize=1000)
chunks.keys()

# Show the first chunk of each dataset; the readme entry is printed as-is.
# Calling next() consumes that first chunk from the iterator.
for name, chunk_generator in chunks.items():
    if name == 'readme':
        print(f"Contents of {name}:")
        print(chunk_generator)
        continue
    print(f"First chunk of {name}:")
    print(next(chunk_generator))

First chunk of bios:
      twitter_name          id    screen_name             name  \
0             Nike   415859364           Nike             Nike   
1         adidasFR    25487201       adidasFR    adidas France   
2          Lacoste    18481641        Lacoste          Lacoste   
3         Converse   183398746       Converse         Converse   
4          Moncler   187897893        Moncler          Moncler   
..             ...         ...            ...              ...   
232      Cdiscount    63142684      Cdiscount        Cdiscount   
233           Fnac     8806412           Fnac             Fnac   
234  rueducommerce    16933084  rueducommerce  Rue du Commerce   
235         vinted  2228641753         vinted           Vinted   
236   AmazonFrance  3823963875   AmazonFrance        Amazon.fr   

                                           description  \
0                 #BlackLivesMatter and #StopAsianHate   
1                         𝕭𝖎𝖊𝖓𝖛𝖊𝖓𝖚𝖊 𝖆𝖚 𝕮𝖑𝖚𝖇 𝕺𝖗𝖎𝖌𝖎𝖓𝖆𝖑𝖘.   
2    Moving 

In [8]:
# Display the dataset names available in the chunked result.
chunks.keys()

dict_keys(['bios', 'followers_bios', 'followers', 'friends', 'readme'])

In [24]:


def inspect_problematic_row(data_path, key, row_number):
    chunksize = 1000  # Adjust as needed
    file_path = data_path + '/' + key + '.csv'
    i = 0
    for chunk in pd.read_csv(file_path, chunksize=chunksize):
        chunk.reset_index(inplace=True)
        if row_number < (i + 1) * chunksize and row_number >= i * chunksize:
            if row_number % chunksize < len(chunk):
                return chunk.loc[row_number % chunksize], row_number
        i += 1

# Usage: fetch row 287131 of the followers-bios CSV and display it
key = 'markers_followers_bios_2023-05-19'
row_number = 287131
problematic_row, original_row_number = inspect_problematic_row(data_path, key, row_number)
print(f"Original row number: {original_row_number}")
# One call prints the caption and the row on separate lines
print("Row data:", problematic_row, sep="\n")

Original row number: 287131
Row data:
index                                                               287131
twitter_id                                             1614030263431974912
id                                                     1614030263431974912
screen_name                                                    PatriceVrai
name                                                       Patrice le vrai
description                                                            NaN
url                                                                    NaN
timestamp_utc                                                   1673649794
local_time                                             2023-01-13T22:43:14
location                                                               NaN
verified                                                                 0
protected                                                                0
tweets                                                        

In [None]:
import pandas as pd

def inspect_problematic_row(data_path, key, row_number, chunksize=10000):
    """Return one data row of ``<data_path>/<key>.csv`` by streaming the file in chunks.

    NOTE(review): an earlier cell defines a function with the same name that
    returns ``(row, row_number)``; this definition shadows it and returns only
    the row.

    Parameters
    ----------
    data_path : str
        Directory that contains the CSV file.
    key : str
        File name without the ``.csv`` extension.
    row_number : int
        Zero-based position of the wanted data row (the header is not counted).
    chunksize : int, optional
        Number of rows read per chunk. Defaults to 10000.

    Returns
    -------
    pandas.Series
        The requested row; its ``name`` is the row's label in the full file.

    Raises
    ------
    IndexError
        If ``row_number`` is negative or lies beyond the last row of the file.
    """
    file_path = f"{data_path}/{key}.csv"
    if row_number < 0:
        raise IndexError(f"row_number must be non-negative, got {row_number}")

    # Which chunk holds the row, and where inside that chunk it sits.
    target_chunk, offset = divmod(row_number, chunksize)

    for i, chunk in enumerate(pd.read_csv(file_path, chunksize=chunksize)):
        if i < target_chunk:
            continue
        if offset < len(chunk):
            # Chunk labels continue across chunks (the second chunk starts at
            # `chunksize`, not 0), so a label lookup with the within-chunk
            # offset fails for every chunk after the first. Select by position.
            return chunk.iloc[offset]
        # Only a short final chunk can get here: the file ends before the row.
        break

    raise IndexError(f"row {row_number} is out of range for {file_path}")

# Usage: look up row 287131 and print it.
# data_path is re-assigned here to the same directory used at the top of the notebook.
data_path = "/home/livtollanes/SocialMarkers"
# NOTE(review): the earlier cell that ran successfully used the file stem
# 'markers_followers_bios_2023-05-19'; this key resolves to
# 'marker_followers_bios.csv' — confirm that such a file exists.
key = 'marker_followers_bios'
row_number = 287131
problematic_row = inspect_problematic_row(data_path, key, row_number)
print(problematic_row)

### Data merging for French entries only

Total number of rows in followers_bios: 0
