### Challenge
This year Kaggle is launching the second annual Data Science Survey Challenge, where we will be awarding a prize pool of $30,000 to notebook authors who tell a rich story about a subset of the data science and machine learning community.

In [19]:
import numpy as np
import pandas as pd
import plotly as py
import plotly.express as px
import plotly.graph_objects as go

In [20]:
# Read the multiple-choice survey responses from the local datasets folder.
dataset = pd.read_csv(
    filepath_or_buffer='./datasets/kaggle-survey-2019/multiple_choice_responses.csv',
)

In [21]:
# Bar chart: number of respondents per Q5 answer (current role / job title).
#
# NOTE(review): label 0 appears to be the question-text row of this CSV (a
# later cell drops `dataset.index[0]`) — confirm. It is excluded here so it is
# not plotted as a one-respondent category. errors='ignore' keeps the cell
# working when that row has already been removed.
role_counts = (
    dataset['Q5']
    .drop(index=0, errors='ignore')
    .value_counts()   # computed once instead of once per axis
    .sort_index()     # alphabetical by answer label; also safe on empty input
)
temp_x = tuple(role_counts.index)
temp_y = tuple(role_counts.to_list())

fig = go.Figure(data=[go.Bar(x=temp_x, y=temp_y)])
# Q5 holds job roles, not ages, so the figure is labelled accordingly.
fig.update_layout(
    title_text='Count of Respondents by Current Role',
    xaxis_title='Current role (Q5)',
    yaxis_title='Respondents',
)

In [22]:
# Subset of respondents whose Q5 answer is exactly "Not employed", with a preview.
unemp = dataset.loc[dataset['Q5'].eq("Not employed")]
unemp.head()

Unnamed: 0,Time from Start to Finish (seconds),Q1,Q2,Q2_OTHER_TEXT,Q3,Q4,Q5,Q5_OTHER_TEXT,Q6,Q7,...,Q34_Part_4,Q34_Part_5,Q34_Part_6,Q34_Part_7,Q34_Part_8,Q34_Part_9,Q34_Part_10,Q34_Part_11,Q34_Part_12,Q34_OTHER_TEXT
42,334,22-24,Male,-1,India,Master’s degree,Not employed,-1,,,...,,,,,,,,,,-1
43,578,22-24,Male,-1,India,Master’s degree,Not employed,-1,,,...,,,,,,,,,,-1
112,677,25-29,Female,-1,Morocco,Master’s degree,Not employed,-1,,,...,,,,,,,,,,-1
117,386,22-24,Male,-1,Greece,Master’s degree,Not employed,-1,,,...,,,,,,,,,,-1
127,349,25-29,Male,-1,Other,No formal education past high school,Not employed,-1,,,...,,,,,,,,,,-1


In [24]:
# Bar chart: gender answers (Q2) among the respondents who are not employed.
gender_counts = (
    unemp['Q2']
    .value_counts()   # computed once instead of once per axis
    .sort_index()     # alphabetical by answer label; also safe on empty input
)
temp_x = tuple(gender_counts.index)
temp_y = tuple(gender_counts.to_list())

fig = go.Figure(data=[go.Bar(x=temp_x, y=temp_y)])
# Q2 holds gender, not age, so the figure is labelled accordingly.
fig.update_layout(
    title_text='Gender of Respondents Who Are Not Employed',
    xaxis_title='Gender (Q2)',
    yaxis_title='Respondents',
)

In [4]:
# Remove the first CSV row (index label 0), which is not a respondent record.
# Dropping by label with errors='ignore' makes the cell idempotent: re-running
# it no longer deletes one more real respondent each time.
dataset = dataset.drop(index=0, errors='ignore')

# Exclude students and people who are not employed. Rows with a missing Q5
# are kept, because NaN is not in the exclusion list.
dataset = dataset[~dataset['Q5'].isin(["Student", "Not employed"])]

dataset.head()


Unnamed: 0,Time from Start to Finish (seconds),Q1,Q2,Q2_OTHER_TEXT,Q3,Q4,Q5,Q5_OTHER_TEXT,Q6,Q7,...,Q34_Part_4,Q34_Part_5,Q34_Part_6,Q34_Part_7,Q34_Part_8,Q34_Part_9,Q34_Part_10,Q34_Part_11,Q34_Part_12,Q34_OTHER_TEXT
1,510,22-24,Male,-1,France,Master’s degree,Software Engineer,-1,"1000-9,999 employees",0,...,,,,,,,,,,-1
2,423,40-44,Male,-1,India,Professional degree,Software Engineer,-1,"> 10,000 employees",20+,...,,,,,,,,,,-1
3,83,55-59,Female,-1,Germany,Professional degree,,-1,,,...,,,,,,,,,,-1
4,391,40-44,Male,-1,Australia,Master’s degree,Other,0,"> 10,000 employees",20+,...,,,,,,Azure SQL Database,,,,-1
5,392,22-24,Male,-1,India,Bachelor’s degree,Other,1,0-49 employees,0,...,,,,,,,,,,-1


In [5]:
# Keep every populated value and put None wherever a value is missing,
# then show how many non-null entries each column has.
dataset = dataset.where(dataset.notna(), other=None)
dataset.count()

Time from Start to Finish (seconds)    14761
Q1                                     14761
Q2                                     14761
Q2_OTHER_TEXT                          14761
Q3                                     14761
                                       ...  
Q34_Part_9                               479
Q34_Part_10                              526
Q34_Part_11                             1245
Q34_Part_12                              287
Q34_OTHER_TEXT                         14761
Length: 246, dtype: int64

In [6]:
# Frequency of each age bracket (Q1), most common first.
dataset.loc[:, 'Q1'].value_counts()

25-29    3597
30-34    2787
22-24    2105
35-39    1923
40-44    1327
45-49     894
18-21     707
50-54     650
55-59     377
60-69     307
70+        87
Name: Q1, dtype: int64

In [7]:
# Pair each Q1 age bracket with its frequency, order the pairs by bracket
# label, then split them back into two parallel tuples for plotting.
age_counts = dataset['Q1'].value_counts()
temp_x, temp_y = zip(*sorted(zip(age_counts.index.values, age_counts.to_list())))

In [8]:
# Show the bracket labels and their counts, one tuple per line.
print(temp_x, temp_y, sep='\n')

('18-21', '22-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59', '60-69', '70+')
(707, 2105, 3597, 2787, 1923, 1327, 894, 650, 377, 307, 87)


In [9]:
# Bar chart: respondents per age bracket, with the most common bracket highlighted.

# One colour per bar. A fixed-length list of 5 does not cover every bracket
# in temp_x, so the list is sized from the data instead.
colors = ['lightslategray'] * len(temp_x)
if temp_y:
    # Highlight the tallest bar rather than a hard-coded position
    # (for the counts printed above this is still the '25-29' bar).
    colors[temp_y.index(max(temp_y))] = 'crimson'

fig = go.Figure(data=[go.Bar(
    x=temp_x,
    y=temp_y,
    marker_color=colors,  # iterable with one colour per bar
)])
fig.update_layout(
    title_text='Count of Ages',
    xaxis_title='Age bracket (Q1)',
    yaxis_title='Respondents',
)

In [10]:
import pycountry

def get_name(code):
    '''
    Translate a country name into its ISO 3166-1 alpha-3 code.

    The common name is tried first, then the official name.

    Parameters
    ----------
    code : str
        Country name as it appears in the data (e.g. "India").

    Returns
    -------
    The three-letter alpha-3 code when pycountry recognises the name;
    otherwise `code` is returned unchanged (e.g. "Other", or a non-string
    value such as None).
    '''
    # Non-string values cannot be country names; pass them through untouched.
    if not isinstance(code, str):
        return code

    for field in ('name', 'official_name'):
        try:
            country = pycountry.countries.get(**{field: code})
        except LookupError:
            # Depending on the pycountry version an unknown name is reported
            # either by returning None or by raising a lookup error (KeyError).
            country = None
        if country is not None:
            return country.alpha_3

    # Unknown to pycountry: fall back to the original value.
    return code

In [11]:
# Respondents per country (Q3), plus the ISO alpha-3 code for each country.
country_counts = dataset['Q3'].value_counts()
temp_x = country_counts.index.values
temp_y = country_counts.to_list()

d = dict(country=temp_x, count=temp_y)
tmp_df = pd.DataFrame(d)
# get_name leaves values it cannot translate (such as "Other") unchanged.
tmp_df['country_iso'] = tmp_df['country'].map(get_name)
tmp_df.head(5)

Unnamed: 0,country,count,country_iso
0,India,2972,IND
1,United States of America,2489,USA
2,Other,830,Other
3,Brazil,606,BRA
4,Japan,580,JPN


In [12]:
# World map shaded by the number of respondents per country.
fig = go.Figure(data=go.Choropleth(
    locations=tmp_df['country_iso'],  # ISO alpha-3 codes produced by get_name
    z=tmp_df['count'],                # respondents per country
    text=tmp_df['country'],           # hover label
    colorscale='Blues',
    autocolorscale=False,
    reversescale=True,
    marker_line_color='darkgray',
    marker_line_width=0.5,
    # The values are head counts, not money, so no '$' tick prefix; the
    # colour bar describes respondents rather than countries.
    colorbar_title='number<br> of respondents',
))
fig.update_layout(title_text='Respondents by Country')
fig.show()


In [13]:
# Respondents per education level (Q4) as a two-column table.
education_counts = dataset['Q4'].value_counts()
temp_x = education_counts.index.values
temp_y = education_counts.to_list()

# The columns must be given as a dict of name -> values. A set literal
# ({temp_x, temp_y}) tries to hash the arrays and raises
# "TypeError: unhashable type: 'numpy.ndarray'".
d = {'education': temp_x, 'count': temp_y}
tmp_df = pd.DataFrame(d)
tmp_df.head()

In [None]:
# For each Q13 answer column (parts 1-11), record the most frequent value in
# that column and how many respondents gave it.
d = {}

for i in range(1, 12):
    part_counts = dataset['Q13_Part_' + str(i)].value_counts()
    if part_counts.empty:
        # No answers in this column: nothing to report (indexing [0] would raise).
        continue
    # value_counts() orders by frequency, so position 0 is the top answer.
    # The count stays an int so the column remains numeric for sorting/plotting.
    d[str(part_counts.index[0])] = int(part_counts.iloc[0])

tmp_df = pd.DataFrame(list(d.items()), columns=["education", "count"])
tmp_df