In [235]:
import os
import pandas as pd
import numpy as np
from DatasetConversion import generate_all_csv
import string # operacje na obiektach typu str
import re     # wyrazenia regularne
import unicodedata # baza znakow wg Unicode
import textwrap # zawijania wierszy
import datetime # obiekty typu data i czas

# Załadowanie danych 

In [263]:
folder_boardgames = os.path.join("..", "data", "boardgames")

In [264]:
# Wczytanie danych z csv
Badges = pd.read_csv(os.path.join(folder_boardgames, "Badges.xml.csv"))
PostLinks = pd.read_csv(os.path.join(folder_boardgames, "PostLinks.xml.csv"))
Posts = pd.read_csv(os.path.join(folder_boardgames, "Posts.xml.csv"))
Tags = pd.read_csv(os.path.join(folder_boardgames, "Tags.xml.csv"))
Users = pd.read_csv(os.path.join(folder_boardgames, "Users.xml.csv"))
Votes = pd.read_csv(os.path.join(folder_boardgames, "Votes.xml.csv"))
Comments = pd.read_csv(os.path.join(folder_boardgames, "Comments.xml.csv"))
PostHistory = pd.read_csv(os.path.join(folder_boardgames, "PostHistory.xml.csv"))

# Operacje na lokalizacji

Sprawdzimy ilu użytkowników ma ustawioną lokalizację.

In [265]:
location = Users["Location"]
not_nan = location.count()
percenage = not_nan / location.shape[0] * 100
print("Użytkowników tego serwisu jest {}. \nAle tylko {} z nich ma ustawioną lokalizację co daje {:.2f}%".format(location.shape[0], not_nan, percenage))

Użytkowników tego serwisu jest 25257. 
Ale tylko 11139 z nich ma ustawioną lokalizację co daje 44.10%


Odrzucamy wartości z Nan i wyświetlamy początek naszego zbioru:

In [266]:
location = location.dropna()
location.head(10)

0                 on the server farm
1                      Corvallis, OR
2         Raleigh, NC, United States
3                          Plano, TX
4               Albuquerque, NM, USA
5    Retired Road, Richland, MO, USA
6            Cardiff, United Kingdom
7        New York, NY, United States
8                     El Cerrito, CA
9                      United States
Name: Location, dtype: object

In [267]:
location[location == "Earth"].count()

54

Są adresy proste, ale i skomplikowane. Niektórzy też mają ciekawe lokalizacje jak *farma serwerów* lub po prostu są obywatelami ZIEMI. 

Weźmy ostani człon adresu:

In [344]:
country = location.map(lambda x: x.split(', ')[-1])
country.head(10)

0    on the server farm
1                    OR
2         United States
3                    TX
4                   USA
5                   USA
6        United Kingdom
7         United States
8                    CA
9         United States
Name: Location, dtype: object

Stworzymy teraz dataset z krajami:

In [345]:
country = pd.DataFrame(country.reset_index(drop=True))

Zamiania skrótów na pełne nazwy dla ujednolicenia zapisu:

In [346]:
country.loc[country["Location"] == "UK", "Location"] = "United Kingdom"
country.loc[country["Location"] == "USA", "Location"] = "United States"

Takie są najbardziej popularne kraje:

In [347]:
country.value_counts().head(10)

Location      
United States     2045
United Kingdom     872
Canada             518
India              396
Germany            392
Australia          276
CA                 243
France             241
Netherlands        184
Sweden             153
dtype: int64

Ale USA sprawia najwięcej problemów:

In [348]:
regex_two_capitals = re.compile("[A-Z]{2}$")
regex_us = re.compile("United States")

In [281]:
states_id = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", 
          "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", 
          "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", 
          "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", 
          "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]
len(states_id)

51

In [312]:
state_names = ["Alaska", "Alabama", "Arkansas", "American Samoa", "Arizona", 
               "California", "Colorado", "Connecticut", "District ", "of Columbia", 
               "Delaware", "Florida", "Georgia", "Guam", "Hawaii", "Iowa", "Idaho", 
               "Illinois", "Indiana", "Kansas", "Kentucky", "Louisiana", "Massachusetts", 
               "Maryland", "Maine", "Michigan", "Minnesota", "Missouri", "Mississippi", 
               "Montana", "North Carolina", "North Dakota", "Nebraska", "New Hampshire", 
               "New Jersey", "New Mexico", "Nevada", "New York", "Ohio", "Oklahoma", 
               "Oregon", "Pennsylvania", "Puerto Rico", "Rhode Island", "South Carolina", 
               "South Dakota", "Tennessee", "Texas", "Utah", "Virginia", "Virgin Islands", 
               "Vermont", "Washington", "Wisconsin", "West Virginia", "Wyoming"]
len(state_names)

56

In [313]:
def regex_apply_states(val):
    if val and (bool(regex_two_capitals.match(val) and val in states_id) or regex_us.findall(val) or val in state_names):
        return 'United States'
    else:
        return val

In [349]:
country['Location'] = country['Location'].apply(regex_apply_states)

Niemcy też mają różne nazwy:

In [350]:
country.loc[country["Location"] == "Deutschland", "Location"] = "Germany"

In [351]:
country['Location'].value_counts().head(10)

United States     4208
United Kingdom     872
Canada             518
Germany            469
India              396
Australia          276
France             241
Netherlands        184
Sweden             153
Brazil             141
Name: Location, dtype: int64

Liczba mieszkańców USA znacznie się powiększyła!

Przygotujmy ładną ramkę daych:

In [352]:
df = pd.DataFrame(country['Location'].value_counts().reset_index())
df.columns = ["CountryName", "TotalUsers"]

In [353]:
df

Unnamed: 0,CountryName,TotalUsers
0,United States,4208
1,United Kingdom,872
2,Canada,518
3,Germany,469
4,India,396
...,...,...
936,Lebanon,1
937,Seattle area,1
938,QATAR,1
939,Astoria,1


Mamy 994 unikalnych lokalizacji! Chociać niektórych chyba nie będziemy mogli zaznaczyć na mapie świata...

In [354]:
df[df.TotalUsers==1]

Unnamed: 0,CountryName,TotalUsers
242,GMT -8:00,1
243,Memphis TN,1
244,Internet Cloud,1
245,PR,1
246,تونس‎,1
...,...,...
936,Lebanon,1
937,Seattle area,1
938,QATAR,1
939,Astoria,1


Weźmy 20 pierwszych krajów w tym rankingu popularności:

In [355]:
df = df.head(20)

In [384]:
from pyecharts.charts import WordCloud

name = list(df.CountryName)
value = [int(i/10)*100  for i in df.TotalUsers]

wordcloud = WordCloud() 
wordcloud.add("Popular Countries", list(zip(name, value)), word_size_range=None)
wordcloud.render_notebook() 

# Mapa 1

Do narysowania mapy świata potrzebujemy kody poszczególnych krajów. Z pomocą przychodzi pakiet *pycountry_convert*!

In [323]:
from pycountry_convert import country_alpha2_to_continent_code, country_name_to_country_alpha2

def get_continent(col):
    try:
        cn_a2_code =  country_name_to_country_alpha2(col)
    except:
        cn_a2_code = 'Unknown' 
    try:
        cn_continent = country_alpha2_to_continent_code(cn_a2_code)
    except:
        cn_continent = 'Unknown' 
    return (cn_a2_code, cn_continent)

In [None]:
df['Codes'] = df['CountryName'].apply(get_continent)
df['Country'] = df['Codes'].apply(lambda x: x[0])
df['Continet'] = df['Codes'].apply(lambda x: x[1])

Zobaczmy jak wyglądają nasze dane...

In [325]:
df

Unnamed: 0,CountryName,TotalUsers,Codes,Country,Continet
0,United States,4208,"(US, NA)",US,
1,United Kingdom,872,"(GB, EU)",GB,EU
2,Canada,518,"(CA, NA)",CA,
3,Germany,469,"(DE, EU)",DE,EU
4,India,396,"(IN, AS)",IN,AS
5,Australia,276,"(AU, OC)",AU,OC
6,France,241,"(FR, EU)",FR,EU
7,Netherlands,184,"(NL, EU)",NL,EU
8,Sweden,153,"(SE, EU)",SE,EU
9,Brazil,141,"(BR, SA)",BR,SA


Ziemia jest popularnym wyborem! Do pozostałych państw możemy uzyskać ich lokalizację za pomocą pakietu *geopy.geocoders*

In [326]:
#function to get longitude and latitude data from country name
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="my-application-1")

def geolocate(country):
    try:
        # Geolocate the center of the country
        loc = geolocator.geocode(country)
        # And return latitude and longitude
        return (loc.latitude, loc.longitude)
    except:
        # Return missing value
        return np.nan

In [None]:
df['Geolocate'] = df['CountryName'].apply(geolocate)
df['Latitude'] = df['Geolocate'].apply(lambda x: x[0])
df['Longitude'] = df['Geolocate'].apply(lambda x: x[1])

In [388]:
df

Unnamed: 0,CountryName,TotalUsers,Codes,Country,Continet,Geolocate,Latitude,Longitude
0,United States,4208,"(US, NA)",US,,"(39.7837304, -100.4458825)",39.78373,-100.445882
1,United Kingdom,872,"(GB, EU)",GB,EU,"(54.7023545, -3.2765753)",54.702354,-3.276575
2,Canada,518,"(CA, NA)",CA,,"(61.0666922, -107.9917071)",61.066692,-107.991707
3,Germany,469,"(DE, EU)",DE,EU,"(51.0834196, 10.4234469)",51.08342,10.423447
4,India,396,"(IN, AS)",IN,AS,"(22.3511148, 78.6677428)",22.351115,78.667743
5,Australia,276,"(AU, OC)",AU,OC,"(-24.7761086, 134.755)",-24.776109,134.755
6,France,241,"(FR, EU)",FR,EU,"(46.603354, 1.8883335)",46.603354,1.888334
7,Netherlands,184,"(NL, EU)",NL,EU,"(52.24764975, 5.541246849406163)",52.24765,5.541247
8,Sweden,153,"(SE, EU)",SE,EU,"(59.6749712, 14.5208584)",59.674971,14.520858
9,Brazil,141,"(BR, SA)",BR,SA,"(-10.3333333, -53.2)",-10.333333,-53.2


Teraz wreszcie możemy zobaczyć wynik naszej pracy na mapie świata!

In [330]:
# Create a world map to show distributions of users 
import folium
from folium.plugins import MarkerCluster
#empty map
world_map= folium.Map(tiles="cartodbpositron")
marker_cluster = MarkerCluster().add_to(world_map)
#for each coordinate, create circlemarker of user percent
for i in range(len(df)):
        lat = df.iloc[i]['Latitude']
        long = df.iloc[i]['Longitude']
        radius=5
        popup_text = """Country : {}<br>
                    %of Users : {}<br>"""
        popup_text = popup_text.format(df.iloc[i]['Country'],
                                   df.iloc[i]['TotalUsers']
                                   )
        folium.CircleMarker(location = [lat, long], radius=radius, popup= popup_text, fill =True).add_to(marker_cluster)
#show the map
world_map

# Mapa 2

In [332]:
from pyecharts.charts import Map,Geo
from pyecharts import options as opts
from pyecharts.globals import ThemeType

Tutaj lepiej usuńmy nieznane terytoria...

In [389]:
df1 = df[df.Country != "Unknown"]

In [390]:
countries=list(df['CountryName'])
totalnumber=list(df['TotalUsers'])

In [391]:
data_list = [[countries[i],totalnumber[i]] for i in range(len(countries))] 
map_1 = Map(init_opts=opts.InitOpts(width="1000px", height="460px")) 
map_1.add('Number of users', data_list, maptype='world', is_map_symbol_show=False)
map_1.set_series_opts(label_opts=opts.LabelOpts(is_show=False)) 
map_1.set_global_opts( visualmap_opts=opts.VisualMapOpts(max_=1100000,is_piecewise=True,
                                                         pieces=[
                                                                {"min":0, "max":100}, 
                                                                {"min":100, "max":200},
                                                                {"min":200, "max":300},
                                                                {"min":300, "max":400},
                                                                {"min":400, "max":500},
                                                                {"min":500, "max":600},
                                                                {"min":600, "max":700},
                                                                {"min":700, "max":800},
                                                                {"min":800, "max":900},
                                                                {"min":900, "max":1000},
                                                                {"min":1000},
                                                             ]),
                                                             title_opts=opts.TitleOpts(
                                                             title='Users of Board Games',
                                                             subtitle='Top 20 countries',
                                                             pos_left='center',
                                                             padding=0,
                                                             item_gap=2,# gap between title and subtitle 
                                                             title_textstyle_opts= opts.TextStyleOpts(color='darkblue',
                                                             font_weight='bold',
                                                             font_family='Courier New',
                                                             font_size=30), 
                                                             subtitle_textstyle_opts= opts.TextStyleOpts(color='grey',
                                                             font_weight='bold',
                                                             font_family='Courier New',
                                                             font_size=13)), 
                                                             legend_opts=opts.LegendOpts(is_show=False))
map_1.render_notebook() 

Źródła:

https://towardsdatascience.com/how-to-make-a-coronavirus-world-map-in-python-734c9fd87195

https://towardsdatascience.com/using-python-to-create-a-world-map-from-a-list-of-country-names-cd7480d03b10