In [2]:
import pandas as pd

In [3]:
# Load data
codebook = pd.read_csv("developer-survey-2018/HackerRank-Developer-Survey-2018-Codebook.csv")
numeric_mapping = pd.read_csv("developer-survey-2018/HackerRank-Developer-Survey-2018-Numeric-Mapping.csv")
country_mapping = pd.read_csv("developer-survey-2018/Country-Code-Mapping.csv")

# Information about data
print("Codebook        shape: ", codebook.shape)
print("Numeric mapping shape: ", numeric_mapping.shape)
print("Country_mapping shape: ", country_mapping.shape)

Codebook        shape:  (251, 3)
Numeric mapping shape:  (452, 3)
Country_mapping shape:  (152, 2)


In [5]:
# Load data with respect to different types of data in some columns
numeric = pd.read_csv("developer-survey-2018/HackerRank-Developer-Survey-2018-Numeric.csv", low_memory=False)
values = pd.read_csv("developer-survey-2018/HackerRank-Developer-Survey-2018-Values.csv", low_memory=False)

# Information about data
print("Numeric shape: ", numeric.shape)
print("Values  shape: ", values.shape)

Numeric shape:  (25090, 251)
Values  shape:  (25090, 251)


In [6]:
# Let's watch on codebook to see column's description
codebook.head(15)

Unnamed: 0,Data Field,Survey Question,Notes
0,RespondentID,,Respondent ID
1,StartDate,,When did they start (date and time)
2,EndDate,,When did they end (date and time)
3,CountryNumeric2,,see Country-Code-Mapping.csv
4,q1AgeBeginCoding,At what age did you start coding,
5,q2Age,How old are you now?,
6,q3Gender,What gender do you identify with?,
7,q4Education,What is the highest level of education you hav...,"Shown if person chooses ""Some College"" or above"
8,q0004_other,Other (please specify),"Shown if person chooses ""Some College"" or above"
9,q5DegreeFocus,What is the focus area of your degree?,


## 1. Какое число опрошенных программистов из России?

In [7]:
# Count values for each country and show first 15
print(values["CountryNumeric2"].value_counts()[:15])

India                 8088
United States         4937
Canada                 642
Brazil                 502
United Kingdom         443
Indonesia              387
Russian Federation     378
Germany                258
Turkey                 250
Poland                 248
Ukraine                187
Singapore              183
Mexico                 181
Bangladesh             174
Romania                174
Name: CountryNumeric2, dtype: int64


In [8]:
first_result = values["CountryNumeric2"].value_counts().to_dict()["Russian Federation"]
print("Answer: ", first_result)

Answer:  378


## 2. Какой процент от программистов из России – студенты?

In [9]:
# Select education rows from values where country is Russian Federation
rus_edu = values[values["CountryNumeric2"] == "Russian Federation"]["q4Education"]
rus_edu.head()

105                                    #NULL!
213                      High school graduate
214    Some post graduate work (Masters, PhD)
215    Some post graduate work (Masters, PhD)
216    Some post graduate work (Masters, PhD)
Name: q4Education, dtype: object

In [10]:
# Let's look on unique values in rus_edu
rus_edu.value_counts()

College graduate                          111
Post graduate degree (Masters, PhD)        98
Some post graduate work (Masters, PhD)     89
High school graduate                       42
Some college                               23
Some high school                           12
#NULL!                                      2
Vocational training (like bootcamp)         1
Name: q4Education, dtype: int64

In [11]:
# Now we can say that wee need 'Some college' or 'Some high school', let's sum it
rus_dict = rus_edu.value_counts().to_dict()
second_result = rus_dict['Some high school'] + rus_dict['Some college']
print("Answer: {} %".format(second_result / len(rus_edu) * 100))

Answer: 9.25925925925926 %


## 3. Какой процент от общего числа опрошенных сейчас занимается Data Science? Web-разработкой? Мобильной разработкой?

In [28]:
# Look at values in column
field_dict = values["q0005_other"].value_counts().to_dict()
field_dict

{"'new' media": 1,
 '(graphic design/computer science)': 1,
 '10th class': 1,
 '2 degrees, MS CS, MA Psychology': 1,
 '3 Degrees: MBA - Business; BS - Computer Science; BBA - Finance': 1,
 '3D Animation': 1,
 'AAS - CAD': 1,
 'AI': 1,
 'ARTIFICIAL INTELLIGENCE': 1,
 'Accountancy': 1,
 'Accounting': 7,
 'Accounting, Computers, and minor in Finance': 1,
 'Accounting, Finance': 1,
 'Acoustics, Music Theory': 1,
 'Acting': 1,
 'Actuarial science': 1,
 'Aero space engineering': 1,
 'Aeronautical engineering': 1,
 'Aeronautics research': 1,
 'Agriculture': 1,
 'Airlines management': 1,
 'Android and iOS developement': 1,
 'Anthropology': 3,
 'Anthropology/History': 1,
 'Anything in information security like penetration testing or computer security consulting': 1,
 'Applied Geophysics': 1,
 'Applied Linguistics': 1,
 'Applied Mathematics and Law (double degree)': 1,
 'Apps and games development': 1,
 'Archaeology': 1,
 'Architecture': 8,
 'Art': 6,
 'Art History': 1,
 'Art History, Classical 

In [29]:
# Now we will try to get keys for our filtering
web_keys    = []
datas_keys  = []
mobile_keys = []

for k in field_dict:
    kl = k.lower()
    if "web" in kl:
        web_keys.append(kl)
    elif "data" in kl or "machine" in kl or "learning" in kl or "artificial" in kl or "ai" in kl:
        datas_keys.append(kl)
    elif "mobile" in kl or "ios" in kl or "andriod" in kl:
        mobile_keys.append(kl)
        
print("Web keys:      ", web_keys, "\n")
print("Data sc. keys: ", datas_keys, "\n")
print("Mobile keys:   ", mobile_keys, "\n")

Web keys:       ['software & web development', 'web', 'finance,banking,income tax officer,web developer', 'web design and development', 'it applied to web and multimedia design / it applied to digital animation', 'full stack web development', 'web development', 'web developement', 'full stack web developer', 'web development', 'web development'] 

Data sc. keys:  ['computer science and machine learning', 'data science', 'data comms and software engineering', 'machine learning and ai', 'data analytics, finance', 'biomedical (data-science)', 'ba - philosophy , ms - data analytics', 'data science', 'international affairs', 'data science machine learning', 'computer science and artificial intelligence', 'supply chain, history', 'deep learning and a.i.', 'ai', 'machine learning', 'data science', 'big data and data analytics', 'software engineering, ai, machine learning', 'learning technologies', 'technology, database', 'machine learning, python, ios development and networking', 'artificial 

In [30]:
# Specify not valid keys for each field
web_bad_keys    = []
datas_bad_keys  = ['technology, database', 'data comms and software engineering', 'airlines management', 'databases'
                  'international affairs', 'supply chain, history']
mobile_bad_keys = ['dip in automobile engg']

# Resulting variables
web_result = 0
datas_result = 0
mobile_result = 0

# Count results
for k in field_dict:
    kl = k.lower()
    if kl in web_keys and kl not in web_bad_keys:
        web_result += field_dict[k]
    elif kl in datas_keys and kl not in datas_bad_keys:
        datas_result += field_dict[k]
    elif kl in mobile_keys and kl not in mobile_bad_keys:
        mobile_result += field_dict[k]

print("Web:                {} %".format(web_result / len(field_dict) * 100))
print("Data science:       {} %".format(datas_result / len(field_dict) * 100))
print("Mobile Development: {} %".format(mobile_result / len(field_dict) * 100))

Web:                1.5053763440860215 %
Data science:       3.4408602150537635 %
Mobile Development: 0.21505376344086022 %


## 4. Какой процент от программистов из России правильно ответили на каждый вопрос?

In [31]:
# Look in which columns we expect answers
codebook["Notes"].value_counts().to_dict()

{'Respondent ID': 1,
 'Shown if person chooses "Some College" or above': 2,
 'Shown if q33HackerRankChallforJob == Yes': 1,
 'Shown if q33HackerRankChallforJob == Yes; rating of 0-5': 1,
 'When did they end (date and time)': 1,
 'When did they start (date and time)': 1,
 'right answer: 1 or \'prints  "Hello World!" n times\'': 1,
 'right answer: 1 or C': 1,
 'right answer: 1 or num%2 == 0': 1,
 'right answer: 2 or Queue': 1,
 'see Country-Code-Mapping.csv': 1,
 'shown if q16HiringManager == Yes': 85,
 'shown if q8JobLevel is 4 or above (Level 1 or above)': 2,
 'skip if q9CurrentRole is student or unemployed': 1}

In [32]:
# Collect it's names
columns_names = []
for k in codebook["Notes"].value_counts().to_dict():
    if 'right answer' in k.lower():
        columns_names.append(k)
columns_names

['right answer: 1 or C',
 'right answer: 1 or \'prints  "Hello World!" n times\'',
 'right answer: 1 or num%2 == 0',
 'right answer: 2 or Queue']

In [59]:
# Map each column's name with data field
import re # used to delete double spaces in right answer

data_field_cols = {}
for name in columns_names:
    val = list(codebook[codebook["Notes"] == name]["Data Field"])[0]
    data_field_cols[val] = name
    
print(data_field_cols)

print("\n\n")

# Convert each sentence in list of right answers
for k in data_field_cols:
    splitted_by_right_ans = data_field_cols[k].split("right answer: ")[1]
    splitted_by_or = splitted_by_right_ans.split("or", 1)
    splitted_by_or = [re.sub(' +', ' ', i.strip().replace("\'", "")) for i in splitted_by_or]
    data_field_cols[k] = splitted_by_or
    
print(data_field_cols)

{'q15Level2': 'right answer: 1 or \'prints  "Hello World!" n times\'', 'q31Level3': 'right answer: 1 or num%2 == 0', 'q7Level1': 'right answer: 1 or C', 'q36Level4': 'right answer: 2 or Queue'}



{'q15Level2': ['1', 'prints "Hello World!" n times'], 'q31Level3': ['1', 'num%2 == 0'], 'q7Level1': ['1', 'C'], 'q36Level4': ['2', 'Queue']}


In [51]:
# Get all people from Russia with their answers
rus_ans = values[values["CountryNumeric2"] == "Russian Federation"][[k for k in data_field_cols]]
rus_ans_len = rus_ans.shape[0]
rus_ans[:15]

Unnamed: 0,q15Level2,q31Level3,q7Level1,q36Level4
105,"prints ""Hello, World!"" n times",num%2 == 0,C,Queue
213,"prints ""Hello, World!"" n times",num%2 == 0,C,Hashmap
214,"prints ""Hello, World!"" n times",num%2 == 0,C,Hashmap
215,"prints ""Hello, World!"" n times",num%2 == 0,C,Queue
216,"prints ""Hello, World!"" n times",num%2 == 0,C,Queue
555,"prints ""Hello, World!"" n times",num%2 == 0,C,Queue
595,"prints ""Hello, World!"" n times",num%2 == 0,C,Set
808,"prints ""Hello, World!"" n times",num%2 == 0,C,Queue
812,"prints ""Hello, World!"" n times",num%2 == 0,C,Queue
905,"prints ""Hello, World!"" n times",num%2 == 0,C,Queue


In [63]:
# Average only correct answers variable
average = 0

for index, row in rus_ans.iterrows():
    to_add = True
    for i in range(len(rus_ans.columns)):
        if row[rus_ans.columns[i]].replace(",", "").strip() not in right_answers[i]:
            to_add = False
            break
    
    if to_add:
        average += 1
        
print("Answer: {} %".format(average / len(rus_ans) * 100))

Answer: 77.5132275132275 %
