Python Data Science Tutorial: Analyzing the 2019 Stack Overflow Developer Survey

[Data Source - Stack Overflow](https://insights.stackoverflow.com/survey)

[Tutorial](https://www.youtube.com/watch?v=_P7X8tMplsw)

In [1]:
import csv

In [14]:
# csv.DictReader yields each row as a dict-like mapping keyed by the header names

with open('survey_results_public.csv') as csv_file:
    csv_reader = csv.DictReader(csv_file)

    # The reader is a one-pass iterator, not a list, so it cannot be indexed.
    # Pull just the first survey response and print it (nothing is printed
    # when the file has no data rows).
    first_response = next(csv_reader, None)
    if first_response is not None:
        print(first_response)

OrderedDict([('Respondent', '1'), ('MainBranch', 'I am a student who is learning to code'), ('Hobbyist', 'Yes'), ('OpenSourcer', 'Never'), ('OpenSource', 'The quality of OSS and closed source software is about the same'), ('Employment', 'Not employed, and not looking for work'), ('Country', 'United Kingdom'), ('Student', 'No'), ('EdLevel', 'Primary/elementary school'), ('UndergradMajor', 'NA'), ('EduOther', 'Taught yourself a new language, framework, or tool without taking a formal course'), ('OrgSize', 'NA'), ('DevType', 'NA'), ('YearsCode', '4'), ('Age1stCode', '10'), ('YearsCodePro', 'NA'), ('CareerSat', 'NA'), ('JobSat', 'NA'), ('MgrIdiot', 'NA'), ('MgrMoney', 'NA'), ('MgrWant', 'NA'), ('JobSeek', 'NA'), ('LastHireDate', 'NA'), ('LastInt', 'NA'), ('FizzBuzz', 'NA'), ('JobFactors', 'NA'), ('ResumeUpdate', 'NA'), ('CurrencySymbol', 'NA'), ('CurrencyDesc', 'NA'), ('CompTotal', 'NA'), ('CompFreq', 'NA'), ('ConvertedComp', 'NA'), ('WorkWeekHrs', 'NA'), ('WorkPlan', 'NA'), ('WorkChalleng

In [6]:
# Evaluating the reader by itself only displays the object's repr, not its rows
csv_reader

<csv.DictReader at 0x10c841f98>

### Access response to Do You Code As A Hobbyist? 

Column is named Hobbyist.

In [17]:
# Each row produced by csv.DictReader can be looked up by column name

with open('survey_results_public.csv') as csv_file:
    csv_reader = csv.DictReader(csv_file)

    # Only the first survey response is needed here, so take one row and
    # print its answer to the Hobbyist question.
    first_response = next(csv_reader, None)
    if first_response is not None:
        print(first_response['Hobbyist'])

Yes


# Analysis 1 - How many respondents answered Yes vs. No to being a Hobbyist?

### Solution 1

In [19]:
with open('survey_results_public.csv') as csv_file:
    csv_reader = csv.DictReader(csv_file)

    # Running tallies for the two answers; any other value is skipped.
    yes_count = no_count = 0

    for row in csv_reader:
        answer = row['Hobbyist']
        if answer == 'Yes':
            yes_count += 1
        elif answer == 'No':
            no_count += 1

print(f'Yes: {yes_count}')
print(f'No: {no_count}')

Yes: 71257
No: 17626


### Solution 1, continue - Change to Percentages

In [23]:
with open('survey_results_public.csv') as csv_file:
    csv_reader = csv.DictReader(csv_file)

    # Running tallies for the two answers; any other value is skipped.
    yes_count = no_count = 0

    for row in csv_reader:
        answer = row['Hobbyist']
        if answer == 'Yes':
            yes_count += 1
        elif answer == 'No':
            no_count += 1

# Percentages are taken over the responses that answered Yes or No.
total_count = yes_count + no_count

yes_pct = round((yes_count / total_count) * 100, 2)
no_pct = round((no_count / total_count) * 100, 2)

print(f'Yes: {yes_pct}%')
print(f'No: {no_pct}%')

Yes: 80.17%
No: 19.83%


### Solution 2 - Use a Dictionary

In [27]:
with open('survey_results_public.csv') as f:
    csv_reader = csv.DictReader(f)

    # A plain dict has to be seeded with every expected answer up front:
    # an answer that is not already a key raises KeyError on the += below.
    counts = {
        'Yes': 0,
        'No': 0,
    }

    for line in csv_reader:
        counts[line['Hobbyist']] += 1


# Percentages are taken over the Yes and No answers counted in this cell.
total = counts['Yes'] + counts['No']

# Divide by this cell's own `total` so the result does not depend on a
# variable left behind by an earlier cell.
yes_pct = (counts['Yes'] / total) * 100
yes_pct = round(yes_pct, 2)

no_pct = (counts['No'] / total) * 100
no_pct = round(no_pct, 2)

print(f'Yes: {yes_pct}%')
print(f'No: {no_pct}%')

Yes: 80.17%
No: 19.83%


### Solution 3 - Use a Default Dictionary so you won't need to initialize a dictionary yourself. 

No more counts = {}

In [28]:
import csv
from collections import defaultdict

with open('survey_results_public.csv') as f:
    csv_reader = csv.DictReader(f)

    # defaultdict(int) supplies 0 for any key seen for the first time, so
    # the answers do not have to be listed up front.
    counts = defaultdict(int)

    for line in csv_reader:
        counts[line['Hobbyist']] += 1


# Percentages are taken over the Yes and No answers counted in this cell.
total = counts['Yes'] + counts['No']

# Divide by this cell's own `total` so the result does not depend on a
# variable left behind by an earlier cell.
yes_pct = (counts['Yes'] / total) * 100
yes_pct = round(yes_pct, 2)

no_pct = (counts['No'] / total) * 100
no_pct = round(no_pct, 2)

print(f'Yes: {yes_pct}%')
print(f'No: {no_pct}%')

Yes: 80.17%
No: 19.83%


### Solution 4 - Use Counter from the collections module

Counter provides helper methods such as `most_common()`, which can display the top 5 values

In [29]:
import csv
from collections import defaultdict, Counter

with open('survey_results_public.csv') as f:
    csv_reader = csv.DictReader(f)

    # A Counter treats a missing key as 0, so no up-front initialization
    # of the answers is needed.
    counts = Counter()

    for line in csv_reader:
        counts[line['Hobbyist']] += 1


# Percentages are taken over the Yes and No answers counted in this cell.
total = counts['Yes'] + counts['No']

# Divide by this cell's own `total` so the result does not depend on a
# variable left behind by an earlier cell.
yes_pct = (counts['Yes'] / total) * 100
yes_pct = round(yes_pct, 2)

no_pct = (counts['No'] / total) * 100
no_pct = round(no_pct, 2)

print(f'Yes: {yes_pct}%')
print(f'No: {no_pct}%')

Yes: 80.17%
No: 19.83%


# Analysis #2 - Which language do you use?

Column Used - LanguageWorkedWith

In [35]:
import csv
from collections import defaultdict, Counter

with open('survey_results_public.csv') as csv_file:
    csv_reader = csv.DictReader(csv_file)

    # Empty tally for now; nothing is counted in this cell yet.
    language_counter = Counter()

    # Peek at the raw LanguageWorkedWith value of the first response.
    first_response = next(csv_reader, None)
    if first_response is not None:
        print(first_response['LanguageWorkedWith'])

HTML/CSS;Java;JavaScript;Python


**Split the output by Semicolon**

In [37]:
import csv
from collections import defaultdict, Counter

with open('survey_results_public.csv') as csv_file:
    csv_reader = csv.DictReader(csv_file)

    # Empty tally for now; nothing is counted in this cell yet.
    language_counter = Counter()

    # Turn the first response's semicolon-separated value into a list.
    first_response = next(csv_reader, None)
    if first_response is not None:
        languages = first_response['LanguageWorkedWith'].split(';')
        print(languages)

['HTML/CSS', 'Java', 'JavaScript', 'Python']


In [45]:
import csv
from collections import defaultdict, Counter

with open('survey_results_public.csv') as csv_file:
    csv_reader = csv.DictReader(csv_file)

    language_counter = Counter()

    # Tally the languages of the first response only, one increment each.
    first_response = next(csv_reader, None)
    if first_response is not None:
        languages = first_response['LanguageWorkedWith'].split(';')

        for language in languages:
            language_counter[language] += 1

        print(language_counter)

Counter({'HTML/CSS': 1, 'Java': 1, 'JavaScript': 1, 'Python': 1})


**Use Counter method to update count**

In [49]:
import csv
from collections import defaultdict, Counter

with open('survey_results_public.csv') as csv_file:
    csv_reader = csv.DictReader(csv_file)

    language_counter = Counter()

    # Tally the languages of the first response only; Counter.update adds
    # one to the count of every item in the list it is given.
    first_response = next(csv_reader, None)
    if first_response is not None:
        languages = first_response['LanguageWorkedWith'].split(';')
        language_counter.update(languages)

        print(language_counter)

Counter({'HTML/CSS': 1, 'Java': 1, 'JavaScript': 1, 'Python': 1})


In [50]:
import csv
from collections import defaultdict, Counter

with open('survey_results_public.csv') as csv_file:
    csv_reader = csv.DictReader(csv_file)

    language_counter = Counter()

    # Every response contributes one count per language it lists.
    for row in csv_reader:
        languages = row['LanguageWorkedWith'].split(';')
        language_counter.update(languages)

# Printed after the file is closed, once all rows have been tallied.
print(language_counter)
        

Counter({'JavaScript': 59219, 'HTML/CSS': 55466, 'SQL': 47544, 'Python': 36443, 'Java': 35917, 'Bash/Shell/PowerShell': 31991, 'C#': 27097, 'PHP': 23030, 'C++': 20524, 'TypeScript': 18523, 'C': 18017, 'Other(s):': 7920, 'Ruby': 7331, 'Go': 7201, 'Assembly': 5833, 'Swift': 5744, 'Kotlin': 5620, 'R': 5048, 'VBA': 4781, 'Objective-C': 4191, 'Scala': 3309, 'Rust': 2794, 'Dart': 1683, 'NA': 1314, 'Elixir': 1260, 'Clojure': 1254, 'WebAssembly': 1015, 'F#': 973, 'Erlang': 777})


In [51]:
import csv
from collections import defaultdict, Counter
import pprint

with open('survey_results_public.csv') as csv_file:
    csv_reader = csv.DictReader(csv_file)

    language_counter = Counter()

    # Every response contributes one count per language it lists.
    for row in csv_reader:
        languages = row['LanguageWorkedWith'].split(';')
        language_counter.update(languages)

# pprint puts each language on its own line, which is easier to scan.
pprint.pprint(language_counter)

Counter({'JavaScript': 59219,
         'HTML/CSS': 55466,
         'SQL': 47544,
         'Python': 36443,
         'Java': 35917,
         'Bash/Shell/PowerShell': 31991,
         'C#': 27097,
         'PHP': 23030,
         'C++': 20524,
         'TypeScript': 18523,
         'C': 18017,
         'Other(s):': 7920,
         'Ruby': 7331,
         'Go': 7201,
         'Assembly': 5833,
         'Swift': 5744,
         'Kotlin': 5620,
         'R': 5048,
         'VBA': 4781,
         'Objective-C': 4191,
         'Scala': 3309,
         'Rust': 2794,
         'Dart': 1683,
         'NA': 1314,
         'Elixir': 1260,
         'Clojure': 1254,
         'WebAssembly': 1015,
         'F#': 973,
         'Erlang': 777})


**Print Top 10 Languages using Counter**

In [53]:
import csv
from collections import defaultdict, Counter
import pprint

with open('survey_results_public.csv') as csv_file:
    csv_reader = csv.DictReader(csv_file)

    language_counter = Counter()

    # Every response contributes one count per language it lists.
    for row in csv_reader:
        languages = row['LanguageWorkedWith'].split(';')
        language_counter.update(languages)

# most_common(10) returns the ten highest counts as (language, count) pairs.
top_ten = language_counter.most_common(10)
pprint.pprint(top_ten)

[('JavaScript', 59219),
 ('HTML/CSS', 55466),
 ('SQL', 47544),
 ('Python', 36443),
 ('Java', 35917),
 ('Bash/Shell/PowerShell', 31991),
 ('C#', 27097),
 ('PHP', 23030),
 ('C++', 20524),
 ('TypeScript', 18523)]


**Print Percentages of Language Used**

In [54]:
import csv
from collections import defaultdict, Counter
import pprint

with open('survey_results_public.csv') as f:
    csv_reader = csv.DictReader(f)

    # Number of survey responses read; used as the percentage denominator.
    total = 0

    language_counter = Counter()

    for line in csv_reader:
        # LanguageWorkedWith is a semicolon-separated list of languages.
        languages = line['LanguageWorkedWith'].split(';')

        language_counter.update(languages)

        total += 1

# Share of all responses that listed each of the five most common languages.
# Divide by this cell's own `total` so the result does not depend on a
# variable left behind by an earlier cell.
for language, value in language_counter.most_common(5):
    lang_pct = (value / total) * 100
    lang_pct = round(lang_pct, 2)

    print(f'{language}: {lang_pct}%')

JavaScript: 66.63%
HTML/CSS: 62.4%
SQL: 53.49%
Python: 41.0%
Java: 40.41%


# Analysis #3 - Breakdown of Languages Used by Role

In [60]:
import csv
from collections import defaultdict, Counter
import pprint

with open('survey_results_public.csv') as csv_file:
    csv_reader = csv.DictReader(csv_file)

    # One key per distinct developer role; the values are empty placeholders.
    dev_type_info = {}

    for row in csv_reader:
        # DevType is a semicolon-separated list of roles.
        for dev_type in row['DevType'].split(';'):
            dev_type_info[dev_type] = {}

# Roles are listed in the order they were first encountered in the file.
for dev_type in dev_type_info:
    print(dev_type)

NA
Developer, desktop or enterprise applications
Developer, front-end
Designer
Developer, back-end
Developer, full-stack
Academic researcher
Developer, mobile
Data or business analyst
Data scientist or machine learning specialist
Database administrator
Engineer, data
Engineer, site reliability
Developer, QA or test
DevOps specialist
Developer, game or graphics
Educator
Student
Engineering manager
Senior executive/VP
System administrator
Developer, embedded applications or devices
Product manager
Scientist
Marketing or sales professional


[Youtube - Lines 15-18](https://youtu.be/_P7X8tMplsw?t=2479)

In [62]:
import csv
from collections import defaultdict, Counter
import pprint

with open('survey_results_public.csv') as csv_file:
    csv_reader = csv.DictReader(csv_file)

    # Maps each developer role to a record holding the number of responses
    # that listed the role ('total') and a tally of the languages those
    # responses listed ('language_counter').
    dev_type_info = {}

    for row in csv_reader:
        # Both columns hold semicolon-separated lists. The languages are the
        # same for every role on this row, so split them once.
        languages = row['LanguageWorkedWith'].split(';')

        for dev_type in row['DevType'].split(';'):
            # First time this role is seen: start it with an empty record.
            if dev_type not in dev_type_info:
                dev_type_info[dev_type] = {
                    'total': 0,
                    'language_counter': Counter(),
                }

            # Credit this response's languages to the role and count the
            # response towards the role's total.
            record = dev_type_info[dev_type]
            record['language_counter'].update(languages)
            record['total'] += 1

# For every role, show its five most common languages as a percentage of
# the responses that listed that role.
for dev_type, info in dev_type_info.items():
    print(dev_type)

    for language, value in info['language_counter'].most_common(5):
        lang_pct = round((value / info['total']) * 100, 2)

        print(f'\t{language}: {lang_pct}%')

NA
	HTML/CSS: 54.9%
	Python: 51.09%
	JavaScript: 50.58%
	Java: 42.71%
	C++: 35.02%
Developer, desktop or enterprise applications
	JavaScript: 67.84%
	HTML/CSS: 64.55%
	SQL: 63.56%
	C#: 53.69%
	Java: 44.69%
Developer, front-end
	JavaScript: 87.72%
	HTML/CSS: 83.62%
	SQL: 58.65%
	Java: 37.6%
	PHP: 35.94%
Designer
	HTML/CSS: 78.88%
	JavaScript: 78.33%
	SQL: 60.18%
	PHP: 40.23%
	Java: 39.44%
Developer, back-end
	JavaScript: 72.23%
	HTML/CSS: 65.42%
	SQL: 64.01%
	Java: 44.03%
	Python: 40.67%
Developer, full-stack
	JavaScript: 86.15%
	HTML/CSS: 78.94%
	SQL: 65.54%
	Java: 40.74%
	Bash/Shell/PowerShell: 37.91%
Academic researcher
	Python: 61.06%
	HTML/CSS: 55.87%
	JavaScript: 54.25%
	SQL: 47.55%
	Java: 42.26%
Developer, mobile
	JavaScript: 67.72%
	HTML/CSS: 62.46%
	Java: 57.21%
	SQL: 51.27%
	C#: 34.34%
Data or business analyst
	SQL: 73.88%
	HTML/CSS: 62.11%
	JavaScript: 61.33%
	Python: 51.86%
	Bash/Shell/PowerShell: 38.43%
Data scientist or machine learning specialist
	Python: 79.33%
	SQL: 58.