# Profiling Image-based Social Sensing Dataset 

In [None]:
import io
import pandas as pd

# Tokens in the raw CSV that pandas should parse as missing values.
missing = ["-0"]
df = pd.read_csv(
    'datasets/162_social_distancing_week_34_results_geoloc_ok.csv',
    na_values=missing,
)

# Optional (disabled): give the unnamed first column an explicit name.
#df = df.rename(columns={df.columns[0]: 'line'})

In [None]:
# List every column name next to its positional index; later cells refer to
# columns by position through `cols`.
cols = list(df.columns)
# Number of rows in the dataset, reused by later cells.
length = df.shape[0]

for i, column_name in enumerate(cols):
    print(i, column_name)

In [None]:
# Country reference table: keep only the two ISO code columns.
countries = pd.read_csv('datasets/ISO 3166-2  countries.csv')
countries_codes = countries[["alpha-2", "alpha-3"]]

# Language-tag table: keep the language code and the territory it is tied to.
languages = pd.read_csv('datasets/ietf-language-tags_csv.csv')[["langType", "territory"]]

# Attach the alpha-3 country code to each language/territory pair by matching
# the territory against the alpha-2 code, then discard duplicate and
# incomplete rows.
languages = (
    languages
    .merge(countries_codes, how="inner", left_on='territory', right_on='alpha-2')
    .drop_duplicates()
    .dropna()
)

# Narrow the posts table to three columns selected by position.
# NOTE(review): per the original author these are the post id, the alpha-3
# country code and the language code -- confirm against the column listing.
df_reduced = df[[cols[22], cols[27], cols[32]]]

In [None]:
# Left-join the reduced posts table with the country/language table, using
# the alpha-3 country code as the key.
languages_merged = df_reduced.merge(
    languages,
    how="left",
    left_on='info_country_code',
    right_on='alpha-3',
)
# Keep every joined row that contains at least one null value.
# NOTE(review): the join key is the country code only, so a row ends up here
# when its country has no entry in `languages` (or when one of its own fields
# is null); the post's language column is never compared with `langType`.
# Confirm this is the intended definition of "not matching".
languages_not_matching = languages_merged.loc[languages_merged.isnull().any(axis=1)]
languages_not_matching

In [None]:
# Ratio between the number of rows flagged in `languages_not_matching` and the
# total number of posts in the dataset.
df_length = len(df[cols[0]])
# An empty dataset yields a ratio of 0 instead of a ZeroDivisionError.
not_matching_ratio = (
    len(languages_not_matching[languages_not_matching.columns[0]]) / df_length
    if df_length
    else 0.0
)
# Format the percentage numerically: slicing the first characters of str(float)
# truncates instead of rounding and mangles values that Python renders in
# exponent notation (e.g. 1.2e-05 would be shown as "1.2e-").
print(f"Posts where the language doesn't match the country's languages: {100 * not_matching_ratio:.2f} %")

## Completeness

In [None]:
# Completeness of each column: percentage of its cells that are not null.
non_null_counts = length - df.isnull().sum()
compl = 100 * non_null_counts / length
# Optional (disabled): order the columns by completeness.
#compl = compl.sort_values(ascending=False)

# Retain only the columns that are not fully complete.
compl = compl.loc[compl < 100]

In [None]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# Horizontal bar chart of the incomplete columns, saved to
# imgs/completeness_1.png. Each bar is annotated with its percentage.
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=14)

if compl.empty:
    # Nothing to draw: an empty Series cannot be rendered as a bar chart, so
    # report the result and skip the figure entirely.
    print("All columns are 100% complete")
else:
    figure(figsize=(5, 7), dpi=200)
    ax = plt.axes()
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

    print("Completeness:")
    # Draw on the axes created above rather than relying on the implicit
    # "current axes".
    compl.plot.barh(ax=ax)
    # pandas places the bars at y = 0, 1, 2, ...; put each label just to the
    # right of its bar.
    for i, v in enumerate(compl):
        ax.text(v + 1, i-0.25, str(v)[0:5] +"%", color='#0C764F',fontsize=10)
    # Leave room on the right for the labels, then fit the layout to the
    # final limits.
    ax.set_xlim([0, 145])
    #ax.set_facecolor("#FCF7EE")
    plt.tight_layout()
    plt.savefig("imgs/completeness_1.png")

In [None]:
# Second view of the same file in which "Not answered" is also parsed as a
# missing value.
missing = ["-0", "Not answered"]
df_2 = pd.read_csv(
    'datasets/162_social_distancing_week_34_results_geoloc_ok.csv',
    na_values=missing,
)

# Per-column completeness (percentage of non-null cells) under this broader
# definition of "missing".
non_null_counts_2 = length - df_2.isnull().sum()
compl_2 = 100 * non_null_counts_2 / length
# Optional (disabled): order the columns by completeness.
#compl_2 = compl_2.sort_values(ascending=False)

# Retain only the columns that are not fully complete.
compl_2 = compl_2.loc[compl_2 < 100]

In [None]:
# Build one table with both completeness measures per column and keep the
# columns whose value differs between the two definitions of "missing".

# Turn each Series into a two-column frame: the field name and its measure.
compl_df = compl.rename_axis('field').reset_index(name='completeness_1')
compl_2_df = compl_2.rename_axis('field').reset_index(name='completeness_2')

# Outer join: `compl` and `compl_2` only contain columns below 100%, so a
# column that is fully complete under one definition is absent from that
# Series. An inner join would silently drop exactly those columns, i.e. the
# ones that become incomplete only once "Not answered" counts as null.
compl_merged = pd.merge(compl_df, compl_2_df, how="outer", on='field')
# A field missing from one side was filtered out for being 100% complete.
measure_columns = ["completeness_1", "completeness_2"]
compl_merged[measure_columns] = compl_merged[measure_columns].fillna(100.0)

# Select only the columns where the completeness measure changes between the
# two datasets.
compl_diff = compl_merged.loc[compl_merged["completeness_1"] != compl_merged["completeness_2"]]

In [None]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# Horizontal bar chart of the columns that are incomplete when "Not answered"
# counts as null, saved to imgs/completeness_2.png. Labels of columns listed
# in `compl_diff` are drawn in red, all others in green.
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=14)

if compl_2.empty:
    # Nothing to draw: an empty Series cannot be rendered as a bar chart, so
    # report the result and skip the figure entirely.
    print("All columns are 100% complete")
else:
    figure(figsize=(5, 7), dpi=200)
    ax = plt.axes()
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

    print("Completeness:")
    compl_2.plot.barh(ax=ax)

    # Exact-name lookup. A substring/regex search (str.contains) would also
    # flag "q10" when looking for "q1" and fails on names that contain regex
    # metacharacters.
    changed_fields = set(compl_diff["field"])
    for i, (field, v) in enumerate(compl_2.items()):
        # Highlight attributes whose completeness differs once "Not answered"
        # is treated as null.
        label_color = '#990D00' if field in changed_fields else '#0C764F'
        ax.text(v + 1, i-0.25, str(v)[0:5] +"%", color=label_color,fontsize=10)

    # Leave room on the right for the labels, then fit the layout to the
    # final limits.
    ax.set_xlim([0, 145])
    #ax.set_facecolor("#FCF7EE")
    plt.tight_layout()
    plt.savefig("imgs/completeness_2.png")

## Coverage

In [None]:
# Number of posts (rows) per country code, most active countries first.
# `size()` counts rows directly; counting the non-null values of whichever
# column happens to come first would under-count as soon as that column
# contains a missing value.
post_per_country = (
    df.groupby(by="info_country_code")
    .size()
    .reset_index(name="Posts")
    .rename(columns={"info_country_code": "Country Code"})
    .sort_values("Posts", ascending=False)
)
post_per_country

In [None]:
# Population table: keep the first two columns plus the second-to-last one,
# which is then labelled "Population".
# NOTE(review): presumably the first two columns are the country name and
# code and the second-to-last is the most recent year -- confirm against the
# file.
world_pop = pd.read_csv('datasets/API_SP.POP.TOTL_DS2_en_csv_v2_2445260.csv')
kept_columns = list(world_pop.columns[0:2]) + [world_pop.columns[-2]]
world_pop = world_pop[kept_columns]
world_pop = world_pop.rename(columns={world_pop.columns[2]: "Population"})

# Attach name and population to the per-country post counts, matching on the
# country code, and fix the column order (the ratio column starts out empty).
columns_titles = ["Country Name", "Posts", "Population", "Posts_Per_Person"]
posts_population = (
    post_per_country.set_index("Country Code")
    .join(world_pop.set_index("Country Code"))
    .reindex(columns=columns_titles)
)

# Posts per inhabitant, highest ratio first.
posts_population["Posts_Per_Person"] = posts_population["Posts"] / posts_population["Population"]
posts_population_ratio = posts_population.sort_values("Posts_Per_Person", ascending=False)

# Drop small countries: with so few inhabitants a handful of posts is enough
# to turn them into outliers.
is_large_enough = posts_population_ratio["Population"] > 100000
posts_population_ratio = posts_population_ratio.loc[is_large_enough]

posts_population_ratio

### Plot the maps

In [None]:
import geopandas as gpd

# Load the world shapefile and attach the per-country statistics to it,
# matching the shapefile's ADM0_A3 column against the index of
# `posts_population_ratio`.
fp = "world_map/ne_110m_admin_0_countries.shp"
map_df = gpd.read_file(fp)
map_by_country_code = map_df.set_index("ADM0_A3")
post_per_country_map = map_by_country_code.join(posts_population_ratio)

In [None]:
# Choropleth of the number of posts per country, saved to
# "imgs/posts per country.png".

# Column of the joined map table to visualise.
variable = "Posts"
# Colour range of the choropleth: smallest value first, largest second.
vmin, vmax = posts_population_ratio[variable].min(), posts_population_ratio[variable].max()

# create figure and axes for Matplotlib
fig, ax = plt.subplots(1, figsize=(17, 6),dpi=200)

# Pass the same range to the map so its colours agree with the colorbar below.
post_per_country_map.plot(column=variable, cmap="BuGn", linewidth=0.8, ax=ax, edgecolor="0.8",
                          vmin=vmin, vmax=vmax)

# add a title
ax.set_title("Posts per country", fontdict={"fontsize": "25", "fontweight" : "3"})

# Stand-alone colour scale used as the legend; it carries no data of its own,
# only the colormap and the value range.
sm = plt.cm.ScalarMappable(cmap="BuGn", norm=plt.Normalize(vmin=vmin, vmax=vmax))
sm.set_array([])
# Tell Matplotlib which axes the colorbar belongs to: the mappable was not
# drawn on any axes, so it cannot be inferred.
fig.colorbar(sm, ax=ax, shrink=0.9)
fig.tight_layout()
fig.savefig("imgs/posts per country.png")

In [None]:
# Choropleth of the number of posts per inhabitant, saved to
# "imgs/posts per person.png".

# Column of the joined map table to visualise.
variable = "Posts_Per_Person"
# Colour range of the choropleth: smallest value first, largest second.
vmin, vmax = posts_population_ratio[variable].min(), posts_population_ratio[variable].max()

# create figure and axes for Matplotlib
fig, ax = plt.subplots(1, figsize=(17, 6),dpi=200)

# Pass the same range to the map so its colours agree with the colorbar below.
post_per_country_map.plot(column=variable, cmap="BuGn", linewidth=0.8, ax=ax, edgecolor="0.8",
                          vmin=vmin, vmax=vmax)

# add a title
ax.set_title("Posts per person", fontdict={"fontsize": "25", "fontweight" : "3"})

# Stand-alone colour scale used as the legend; it carries no data of its own,
# only the colormap and the value range.
sm = plt.cm.ScalarMappable(cmap="BuGn", norm=plt.Normalize(vmin=vmin, vmax=vmax))
sm.set_array([])
# Tell Matplotlib which axes the colorbar belongs to: the mappable was not
# drawn on any axes, so it cannot be inferred.
fig.colorbar(sm, ax=ax, shrink=0.9)
#fig.set_facecolor("#FCF7EE")
#ax.set_facecolor("#FCF7EE")
fig.tight_layout()
fig.savefig("imgs/posts per person.png")

## Dependencies

In [None]:
# Presence indicator for a positional slice of the columns of df_2:
# 1 where a cell holds data, 0 where it is null.
invert_one_zero = {0: 1, 1: 0}

ds = (
    df_2[cols[4:19]]
    .isnull()                   # True where the cell is missing
    .astype(int)                # 1 = missing, 0 = present
    .replace(invert_one_zero)   # flip: 1 = present, 0 = missing
)
ds

In [None]:
# Pairwise Pearson correlation between the presence indicators of the columns.
ds.corr(method='pearson')

In [None]:
import seaborn as sns
import numpy as np

# Draw the Pearson correlation matrix of the presence indicators as a heatmap
# and save it to imgs/correlation.png.
f, ax = plt.subplots(figsize=(10, 8))
corr = ds.corr(method ='pearson')
# All-False mask: no cell of the matrix is hidden. The builtin `bool` is used
# as dtype because the `np.bool` alias no longer exists in current NumPy.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)
plt.savefig("imgs/correlation.png")

 ## Miscellaneous

In [None]:
# Consistency check: list the countries/territories that appear with more
# than one distinct country code.
a = df[cols[26:29]]
# Alternative grouping key (disabled):
#a.groupby(['info_country']).info_country_code.nunique()
codes_per_country = a.groupby(['info_country_or_territory'])['info_country_code'].nunique()
tab = codes_per_country.reset_index()
tab[tab["info_country_code"] > 1]

## Broken Links

Since the program often hangs during this computation, it may be necessary to restart it. To avoid starting over from the beginning, the progress is saved in the file *broken_links.pkl*. To start from the beginning, delete that file.

In [None]:
import requests
import time
import validators
from IPython.display import display, clear_output
import json
import pickle


# Count how many media URLs of the dataset are broken. A link counts as
# broken when it is not a syntactically valid URL, when the request fails
# (timeout, connection error, ...) or when the server answers with an HTTP
# error status (>= 400). Progress is checkpointed after every link so an
# interrupted run resumes where it stopped.
CHECKPOINT_FILE = 'broken_links.pkl'
# Seconds to wait for a server before giving up, so one unresponsive host
# cannot stall the whole run.
REQUEST_TIMEOUT = 10

links = df["info_media_url"]
total_links = len(links)


def _save_progress(next_line, broken_count):
    """Store the index of the next link to check and the broken-link count."""
    with open(CHECKPOINT_FILE, "wb") as checkpoint:
        pickle.dump([next_line, broken_count], checkpoint)


# Resume from the checkpoint when one exists; otherwise start from scratch.
# NOTE: the checkpoint is a pickle written by this cell itself -- never point
# this at a file from an untrusted source.
try:
    with open(CHECKPOINT_FILE, "rb") as f:
        line, broken_links = pickle.load(f)
except (FileNotFoundError, EOFError, pickle.UnpicklingError):
    broken_links = 0
    line = 0
# Broken links found during THIS run (not restored from the checkpoint), in
# case they need to be printed or saved.
broken_links_list = []

for link in links.iloc[line:]:
    # remove previous prints
    clear_output(wait=True)

    print(f"Reading line {line + 1} of {total_links}\nBroken links:{broken_links}")

    # Missing values are read as NaN (a float), so make sure the cell is a
    # string before handing it to the URL validator.
    is_valid_url = isinstance(link, str) and bool(validators.url(link))

    if is_valid_url:
        try:
            # stream=True fetches only the status line and headers; the media
            # body is never downloaded.
            with requests.get(link, timeout=REQUEST_TIMEOUT, stream=True) as response:
                # Every 4xx/5xx status is an error, including 400 itself.
                is_broken = response.status_code >= 400
        except requests.RequestException:
            # Unreachable host, timeout, too many redirects, ...: the media
            # cannot be retrieved, so the link is counted as broken.
            is_broken = True
    else:
        is_broken = True

    if is_broken:
        broken_links += 1
        broken_links_list.append(link)

    line += 1
    _save_progress(line, broken_links)

    if is_valid_url:
        # Twitter doesn't allow more than 1 request per second
        time.sleep(1)


if line == total_links:
    print(f"Broken links: {broken_links} of {total_links} lines")


## Pandas Profiling

In [None]:
#import pandas_profiling
#pandas_profiling.ProfileReport(df)