# Limpeza de dados

In [40]:
import numpy as np
import pandas as pd
import re

In [41]:
df = pd.read_csv('dataset/survey_results_public.csv', sep=',', encoding='utf-8')

In [42]:
df.head()

Unnamed: 0,ResponseId,MainBranch,Employment,RemoteWork,CodingActivities,EdLevel,LearnCode,LearnCodeOnline,LearnCodeCoursesCert,YearsCode,...,TimeSearching,TimeAnswering,Onboarding,ProfessionalTech,TrueFalse_1,TrueFalse_2,TrueFalse_3,SurveyLength,SurveyEase,ConvertedCompYearly
0,1,None of these,,,,,,,,,...,,,,,,,,,,
1,2,I am a developer by profession,"Employed, full-time",Fully remote,Hobby;Contribute to open-source projects,,,,,,...,,,,,,,,Too long,Difficult,
2,3,"I am not primarily a developer, but I write co...","Employed, full-time","Hybrid (some remote, some in-person)",Hobby,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",Books / Physical media;Friend or family member...,Technical documentation;Blogs;Programming Game...,,14.0,...,,,,,,,,Appropriate in length,Neither easy nor difficult,40205.0
3,4,I am a developer by profession,"Employed, full-time",Fully remote,I don’t code outside of work,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Books / Physical media;School (i.e., Universit...",,,20.0,...,,,,,,,,Appropriate in length,Easy,215232.0
4,5,I am a developer by profession,"Employed, full-time","Hybrid (some remote, some in-person)",Hobby,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Other online resources (e.g., videos, blogs, f...",Technical documentation;Blogs;Stack Overflow;O...,,8.0,...,,,,,,,,Too long,Easy,


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73268 entries, 0 to 73267
Data columns (total 79 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   ResponseId                      73268 non-null  int64  
 1   MainBranch                      73268 non-null  object 
 2   Employment                      71709 non-null  object 
 3   RemoteWork                      58958 non-null  object 
 4   CodingActivities                58899 non-null  object 
 5   EdLevel                         71571 non-null  object 
 6   LearnCode                       71580 non-null  object 
 7   LearnCodeOnline                 50685 non-null  object 
 8   LearnCodeCoursesCert            29389 non-null  object 
 9   YearsCode                       71331 non-null  object 
 10  YearsCodePro                    51833 non-null  object 
 11  DevType                         61302 non-null  object 
 12  OrgSize                         

Sabemos pelo schema que o ResponseId é exatamente o que o nome dá a entender. Então o usaremos como index do DataFrame.

In [44]:
df.set_index('ResponseId')
print()




## Primeira seção

Questões 1 a 18

Então avançamos para a próxima coluna: MainBranch. Ela demostra de que forma o respondente se relaciona com desenvolvimento. Vamos aproveitar para substituir os valores por termos mais simples. 

In [45]:
print(df['MainBranch'].unique())

['None of these' 'I am a developer by profession'
 'I am not primarily a developer, but I write code sometimes as part of my work'
 'I code primarily as a hobby' 'I am learning to code'
 'I used to be a developer by profession, but no longer am']


In [46]:
df['MainBranch'] = df['MainBranch'].replace({
  'None of these': 'none',
  'I am a developer by profession': 'dev',
  'I am not primarily a developer, but I write code sometimes as part of my work': 'part_dev',
  'I code primarily as a hobby': 'hobbist',
  'I am learning to code': 'learner',
  'I used to be a developer by profession, but no longer am': 'ex_dev',
})
print(df['MainBranch'].unique())

['none' 'dev' 'part_dev' 'hobbist' 'learner' 'ex_dev']


Partindo para a próxima coluna (Employment), podemos notar pela forma que ela está formatada que era uma questão com múltiplas seleções. Para facilitar o gerenciamento futuro, optamos por reduzir os nomes.

In [47]:
print(df['Employment'].unique())

[nan 'Employed, full-time' 'Student, full-time' 'Student, part-time'
 'Not employed, but looking for work'
 'Independent contractor, freelancer, or self-employed'
 'Employed, full-time;Independent contractor, freelancer, or self-employed'
 'Employed, part-time' 'Student, part-time;Employed, part-time'
 'Not employed, and not looking for work'
 'Student, full-time;Employed, part-time'
 'Employed, full-time;Student, part-time'
 'Employed, full-time;Student, full-time'
 'Student, part-time;Independent contractor, freelancer, or self-employed'
 'Retired' 'Student, full-time;Not employed, but looking for work'
 'I prefer not to say'
 'Student, full-time;Independent contractor, freelancer, or self-employed'
 'Student, full-time;Not employed, and not looking for work'
 'Not employed, but looking for work;Independent contractor, freelancer, or self-employed'
 'Employed, full-time;Student, part-time;Independent contractor, freelancer, or self-employed'
 'Independent contractor, freelancer, or s

In [48]:
df['Employment'] = df['Employment'].str.replace('Independent contractor, freelancer, or self-employed', 'independent', regex=True)
df['Employment'] = df['Employment'].str.replace('Not employed, but looking for work', 'unemployed_looking', regex=True)
df['Employment'] = df['Employment'].str.replace('Not employed, and not looking for work', 'unemployed_not_looking', regex=True)
df['Employment'] = df['Employment'].str.replace('I prefer not to say', 'opt_out', regex=True)
df['Employment'] = df['Employment'].str.lower().str.replace(', ', '_').str.replace('-', '_').str.replace(' ', '_')

In [49]:
df[df['Employment'].isnull()]

Unnamed: 0,ResponseId,MainBranch,Employment,RemoteWork,CodingActivities,EdLevel,LearnCode,LearnCodeOnline,LearnCodeCoursesCert,YearsCode,...,TimeSearching,TimeAnswering,Onboarding,ProfessionalTech,TrueFalse_1,TrueFalse_2,TrueFalse_3,SurveyLength,SurveyEase,ConvertedCompYearly
0,1,none,,,,,,,,,...,,,,,,,,,,
30,31,none,,,,,,,,,...,,,,,,,,,,
86,87,none,,,,,,,,,...,,,,,,,,,,
98,99,dev,,,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",,,,,...,,,,,,,,,,
136,137,hobbist,,,,Some college/university study without earning ...,Friend or family member;Other online resources...,Technical documentation;Blogs;Written Tutorial...,,10,...,,,,,,,,Appropriate in length,Easy,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73109,73110,none,,,,,,,,,...,,,,,,,,,,
73157,73158,none,,,,,,,,,...,,,,,,,,,,
73201,73202,none,,,,,,,,,...,,,,,,,,,,
73215,73216,none,,,,,,,,,...,,,,,,,,,,


Com a impressão acima, nos chamou atenção como algumas linhas possuem muitas respostas vazias. Guardaremos essa informação para analisar depois das colunas.

In [50]:
df['RemoteWork'].unique()

array([nan, 'Fully remote', 'Hybrid (some remote, some in-person)',
       'Full in-person'], dtype=object)

In [51]:
df['RemoteWork'] = df['RemoteWork'].replace({
  'Fully remote': 'remote',
  'Hybrid (some remote, some in-person)': 'hybrid',
  'Full in-person': 'in_person'
})

In [52]:
df['CodingActivities'].unique()

array([nan, 'Hobby;Contribute to open-source projects', 'Hobby',
       'I don’t code outside of work',
       'Hobby;Contribute to open-source projects;Bootstrapping a business',
       'Hobby;Contribute to open-source projects;Freelance/contract work',
       'Hobby;Freelance/contract work', 'Hobby;Bootstrapping a business',
       'Other (please specify):', 'Contribute to open-source projects',
       'Hobby;Other (please specify):',
       'Hobby;Contribute to open-source projects;Bootstrapping a business;Freelance/contract work',
       'Bootstrapping a business', 'Freelance/contract work',
       'Hobby;Bootstrapping a business;Freelance/contract work',
       'Bootstrapping a business;Freelance/contract work',
       'Hobby;Contribute to open-source projects;Other (please specify):',
       'Contribute to open-source projects;Freelance/contract work',
       'Hobby;Freelance/contract work;Other (please specify):',
       'Contribute to open-source projects;Bootstrapping a busine

In [53]:
df['CodingActivities'] = df['CodingActivities'].str.replace('Contribute to open-source projects', 'open_source', regex=True)
df['CodingActivities'] = df['CodingActivities'].str.replace('Bootstrapping a business', 'business', regex=True)
df['CodingActivities'] = df['CodingActivities'].str.replace('School or academic work', 'school', regex=True)
df['CodingActivities'] = df['CodingActivities'].str.replace('Freelance/contract work', 'freelance', regex=True)
df['CodingActivities'] = df['CodingActivities'].str.replace('Other (please specify):', 'other', regex=True)
df['CodingActivities'] = df['CodingActivities'].str.lower().str.replace(', ', '_').str.replace('-', '_').str.replace(' ', '_')

In [54]:
df['EdLevel'].unique()

array([nan, 'Master’s degree (M.A., M.S., M.Eng., MBA, etc.)',
       'Bachelor’s degree (B.A., B.S., B.Eng., etc.)',
       'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)',
       'Some college/university study without earning a degree',
       'Something else', 'Primary/elementary school',
       'Other doctoral degree (Ph.D., Ed.D., etc.)',
       'Associate degree (A.A., A.S., etc.)',
       'Professional degree (JD, MD, etc.)'], dtype=object)

In [55]:
df['EdLevel'] = df['EdLevel'].replace({
  "Master’s degree (M.A., M.S., M.Eng., MBA, etc.)": 'master',
  "Bachelor’s degree (B.A., B.S., B.Eng., etc.)": 'bachelor',
  "Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)": 'high_school',
  "Some college/university study without earning a degree": 'unfinished_college',
  "Something else": 'other',
  "Primary/elementary school": 'primary_school',
  "Other doctoral degree (Ph.D., Ed.D., etc.)": 'doctor',
  "Associate degree (A.A., A.S., etc.)": 'associate',
  "Professional degree (JD, MD, etc.)": 'professional'
})

In [56]:
df['LearnCode'].unique()

array([nan,
       'Books / Physical media;Friend or family member;Other online resources (e.g., videos, blogs, forum);School (i.e., University, College, etc)',
       'Books / Physical media;School (i.e., University, College, etc)',
       'Other online resources (e.g., videos, blogs, forum);School (i.e., University, College, etc);On the job training',
       'Other online resources (e.g., videos, blogs, forum)',
       'Online Courses or Certification',
       'On the job training;Coding Bootcamp',
       'Books / Physical media;Other online resources (e.g., videos, blogs, forum);School (i.e., University, College, etc)',
       'School (i.e., University, College, etc)',
       'Books / Physical media',
       'Books / Physical media;Other online resources (e.g., videos, blogs, forum);School (i.e., University, College, etc);Online Courses or Certification;Colleague',
       'Other online resources (e.g., videos, blogs, forum);School (i.e., University, College, etc);On the job training

In [57]:
df['LearnCode'] = df['LearnCode'].str.replace('Books / Physical media', 'books', regex=True)
df['LearnCode'] = df['LearnCode'].str.replace('Friend or family member', 'someone', regex=True)
df['LearnCode'] = df['LearnCode'].str.replace('Other online resources (e.g., videos, blogs, forum)', 'online', regex=True)
df['LearnCode'] = df['LearnCode'].str.replace('School (i.e., University, College, etc)', 'school', regex=True)
df['LearnCode'] = df['LearnCode'].str.replace('On the job training', 'job', regex=True)
df['LearnCode'] = df['LearnCode'].str.replace('Online Courses or Certification', 'online_course', regex=True)
df['LearnCode'] = df['LearnCode'].str.replace('Coding Bootcamp', 'bootcamp', regex=True)
df['LearnCode'] = df['LearnCode'].str.replace('Hackathons (virtual or in-person)', 'hackathon', regex=True)
df['LearnCode'] = df['LearnCode'].str.replace('Other (please specify):', 'other_', regex=True)
df['LearnCode'] = df['LearnCode'].str.lower().str.replace(', ', '_').str.replace('-', '_').str.replace(' ', '_')

In [58]:
df['LearnCodeOnline'].unique()

array([nan,
       'Technical documentation;Blogs;Programming Games;Written Tutorials;Stack Overflow',
       'Technical documentation;Blogs;Stack Overflow;Online books;Video-based Online Courses;Online challenges (e.g., daily or weekly coding challenges)',
       ...,
       'Written Tutorials;Online books;Video-based Online Courses;How-to videos;Written-based Online Courses;Coding sessions (live or recorded);Certification videos',
       'Programming Games;Stack Overflow;Video-based Online Courses;Online challenges (e.g., daily or weekly coding challenges);How-to videos;Written-based Online Courses;Interactive tutorial;Coding sessions (live or recorded);Certification videos',
       'Technical documentation;Programming Games;Stack Overflow;Online books;Video-based Online Courses;How-to videos;Written-based Online Courses;Coding sessions (live or recorded);Certification videos'],
      dtype=object)

In [59]:
df['LearnCodeOnline'] = df['LearnCodeOnline'].str.lower().str.replace(', ', '_').str.replace('-', '_').str.replace(' ', '_')
# remove os parênteses e tudo que tem dentro
df['LearnCodeOnline'] = df['LearnCodeOnline'].str.replace(r"\(.*\)","")

  df['LearnCodeOnline'] = df['LearnCodeOnline'].str.replace(r"\(.*\)","")


In [60]:
df['LearnCodeCoursesCert'].unique()

array([nan, 'Coursera;Udemy', 'Udemy;Codecademy', 'Coursera;Pluralsight',
       'Coursera;Udemy;Codecademy;edX;Udacity',
       'Coursera;Udemy;Pluralsight;edX', 'Udemy', 'Other',
       'Coursera;Udemy;Udacity', 'Udemy;Pluralsight',
       'Coursera;Udemy;Pluralsight', 'Codecademy', 'Coursera',
       'Coursera;Udemy;edX', 'Udemy;Other', 'Pluralsight',
       'Coursera;Udemy;Codecademy', 'Codecademy;Pluralsight',
       'Coursera;edX', 'Udemy;Codecademy;Pluralsight',
       'Pluralsight;Udacity', 'Coursera;Udemy;Other',
       'Codecademy;Pluralsight;Other',
       'Udemy;Codecademy;Pluralsight;Other', 'Udemy;Pluralsight;Udacity',
       'Coursera;Udemy;Codecademy;Udacity', 'Udemy;edX',
       'Coursera;Udemy;edX;Udacity',
       'Coursera;Pluralsight;edX;Udacity;Other', 'edX',
       'Coursera;Codecademy', 'Coursera;Other', 'Codecademy;Other',
       'Udemy;Codecademy;Pluralsight;edX;Udacity', 'Coursera;Udacity',
       'Udemy;Pluralsight;Other', 'Coursera;Codecademy;Pluralsight;edX

In [61]:
df['LearnCodeCoursesCert'] = df['LearnCodeCoursesCert'].str.lower().str.replace(', ', '_').str.replace('-', '_').str.replace(' ', '_')

In [62]:
df['YearsCode'].unique()

array([nan, '14', '20', '8', '15', '3', '1', '6', '37', '5', '12', '22',
       '11', '4', '7', '13', '36', '2', '25', '10', '40', '16', '27',
       '24', '19', '9', '17', '18', '26', 'More than 50 years', '29',
       '30', '32', 'Less than 1 year', '48', '45', '38', '39', '28', '23',
       '43', '21', '41', '35', '50', '33', '31', '34', '46', '44', '42',
       '47', '49'], dtype=object)

In [63]:
df['YearsCodeNumber'] = pd.to_numeric(df['YearsCode'], errors = 'coerce').replace(np.nan, 0)
df['YearsCodeNumber'].unique()

array([ 0., 14., 20.,  8., 15.,  3.,  1.,  6., 37.,  5., 12., 22., 11.,
        4.,  7., 13., 36.,  2., 25., 10., 40., 16., 27., 24., 19.,  9.,
       17., 18., 26., 29., 30., 32., 48., 45., 38., 39., 28., 23., 43.,
       21., 41., 35., 50., 33., 31., 34., 46., 44., 42., 47., 49.])

In [64]:
df['YearsCodePro'].unique()

array([nan, '5', '17', '3', '6', '30', '2', '10', '15', '4', '22', '20',
       '40', '9', '14', '21', '7', '18', '25', '8', '12', '45', '1', '19',
       '28', '24', '11', '23', 'Less than 1 year', '32', '27', '16', '44',
       '26', '37', '46', '13', '31', '39', '34', '38', '35', '29', '42',
       '36', '33', '43', '41', '48', '50', 'More than 50 years', '47',
       '49'], dtype=object)

In [65]:
df['YearsCodeProNumber'] = pd.to_numeric(df['YearsCodePro'], errors = 'coerce').replace(np.nan, 0)
df['YearsCodeProNumber'].unique()

array([ 0.,  5., 17.,  3.,  6., 30.,  2., 10., 15.,  4., 22., 20., 40.,
        9., 14., 21.,  7., 18., 25.,  8., 12., 45.,  1., 19., 28., 24.,
       11., 23., 32., 27., 16., 44., 26., 37., 46., 13., 31., 39., 34.,
       38., 35., 29., 42., 36., 33., 43., 41., 48., 50., 47., 49.])

In [66]:
df['DevType'].unique()

array([nan,
       'Data scientist or machine learning specialist;Developer, front-end;Engineer, data;Engineer, site reliability',
       'Developer, full-stack', ...,
       'Data scientist or machine learning specialist;Developer, front-end;Developer, full-stack;Developer, back-end;Developer, QA or test;Developer, mobile;Database administrator;Cloud infrastructure engineer;Data or business analyst;Designer;Blockchain',
       'Developer, front-end;Developer, full-stack;Developer, back-end;Developer, desktop or enterprise applications;Developer, mobile;Educator;Developer, embedded applications or devices',
       'Developer, front-end;Engineer, data;Engineer, site reliability;Developer, full-stack;Developer, back-end;Developer, desktop or enterprise applications;Developer, QA or test;Student;Developer, mobile;Academic researcher;DevOps specialist;Developer, embedded applications or devices;Developer, game or graphics;Cloud infrastructure engineer;Data or business analyst;Designer;Scie

In [67]:
df['DevType'] = df['DevType'].str.lower().str.replace(', ', '_').str.replace('-', '_').str.replace(' ', '_')

In [68]:
df['OrgSize'].unique()

array([nan, '20 to 99 employees', '100 to 499 employees', 'I don’t know',
       'Just me - I am a freelancer, sole proprietor, etc.',
       '2 to 9 employees', '5,000 to 9,999 employees',
       '1,000 to 4,999 employees', '10,000 or more employees',
       '500 to 999 employees', '10 to 19 employees'], dtype=object)

Possível análise a se fazer: 
Cruzar tamanho de empresa com salário

In [69]:
df['OrgSize'].unique()

array([nan, '20 to 99 employees', '100 to 499 employees', 'I don’t know',
       'Just me - I am a freelancer, sole proprietor, etc.',
       '2 to 9 employees', '5,000 to 9,999 employees',
       '1,000 to 4,999 employees', '10,000 or more employees',
       '500 to 999 employees', '10 to 19 employees'], dtype=object)

In [70]:
df['PurchaseInfluence'].unique()

array([nan, 'I have some influence', 'I have little or no influence',
       'I have a great deal of influence'], dtype=object)

In [71]:
df.drop('PurchaseInfluence', axis=1, inplace=True)

In [72]:
df['BuyNewTool'].unique()

array([nan, 'Other (please specify):',
       'Start a free trial;Visit developer communities like Stack Overflow',
       'Start a free trial',
       'Start a free trial;Ask developers I know/work with;Research companies that have advertised on sites I visit',
       'Start a free trial;Visit developer communities like Stack Overflow;Ask developers I know/work with',
       'Start a free trial;Ask developers I know/work with',
       'Visit developer communities like Stack Overflow',
       'Start a free trial;Research companies that have advertised on sites I visit',
       'Visit developer communities like Stack Overflow;Ask developers I know/work with',
       'Other (please specify):;Start a free trial;Research companies that have advertised on sites I visit',
       'Ask developers I know/work with',
       'Start a free trial;Visit developer communities like Stack Overflow;Read ratings or reviews on third party sites like G2Crowd',
       'Other (please specify):;Ask developers

In [73]:
df.drop('BuyNewTool', axis=1, inplace=True)

In [74]:
df['Country'].unique()

array([nan, 'Canada',
       'United Kingdom of Great Britain and Northern Ireland', 'Israel',
       'United States of America', 'Germany', 'India', 'Netherlands',
       'Croatia', 'Australia', 'Russian Federation', 'Czech Republic',
       'Austria', 'Serbia', 'Italy', 'Ireland', 'Poland', 'Slovenia',
       'Iraq', 'Sweden', 'Madagascar', 'Norway', 'Taiwan',
       'Hong Kong (S.A.R.)', 'Mexico', 'France', 'Brazil', 'Lithuania',
       'Uruguay', 'Denmark', 'Spain', 'Egypt', 'Turkey', 'South Africa',
       'Ukraine', 'Finland', 'Romania', 'Portugal', 'Singapore', 'Oman',
       'Belgium', 'Chile', 'Bulgaria', 'Latvia', 'Philippines', 'Greece',
       'Belarus', 'Saudi Arabia', 'Kenya', 'Switzerland', 'Iceland',
       'Viet Nam', 'Thailand', 'China', 'Montenegro', 'Slovakia', 'Japan',
       'Luxembourg', 'Turkmenistan', 'Argentina', 'Hungary', 'Tunisia',
       'Bangladesh', 'Maldives', 'Dominican Republic', 'Jordan',
       'Pakistan', 'Nepal', 'Iran, Islamic Republic of...', 'I

In [75]:
df['Country'] = df['Country'].str.lower().str.replace(', ', '_').str.replace('-', '_').str.replace(' ', '_')
# remove os parênteses e tudo que tem dentro
df['Country'] = df['Country'].str.replace(r"\(.*\)","")

  df['Country'] = df['Country'].str.replace(r"\(.*\)","")


In [76]:
df['Currency'].unique()

array([nan, 'CAD\tCanadian dollar', 'GBP\tPound sterling',
       'ILS\tIsraeli new shekel', 'USD\tUnited States dollar',
       'EUR European Euro', 'HRK\tCroatian kuna',
       'AUD\tAustralian dollar', 'RUB\tRussian ruble',
       'CZK\tCzech koruna', 'PLN\tPolish zloty', 'NOK\tNorwegian krone',
       'TWD\tNew Taiwan dollar', 'HKD\tHong Kong dollar',
       'INR\tIndian rupee', 'BRL\tBrazilian real', 'UYU\tUruguayan peso',
       'RSD\tSerbian dinar', 'SEK\tSwedish krona', 'DKK\tDanish krone',
       'EGP\tEgyptian pound', 'TRY\tTurkish lira', 'RON\tRomanian leu',
       'SGD\tSingapore dollar', 'SAR\tSaudi Arabian riyal',
       'BGN\tBulgarian lev', 'BYN\tBelarusian ruble',
       'KES\tKenyan shilling', 'CHF\tSwiss franc',
       'ZAR\tSouth African rand', 'ISK\tIcelandic krona',
       'THB\tThai baht', 'CNY\tChinese Yuan Renminbi',
       'VND\tVietnamese dong', 'ARS\tArgentine peso',
       'HUF\tHungarian forint', 'TND\tTunisian dinar',
       'BDT\tBangladeshi taka', 'MVR\

In [77]:
# limpa a moeda para ficar apenas o código de 3 letras dela
df['Currency'] = df['Currency'].str.replace(r"[(\t)\s].*","")

  df['Currency'] = df['Currency'].str.replace(r"[(\t)\s].*","")


In [78]:
df['CompTotal'].unique()

array([     nan,   32000.,   60000., ..., 1038000.,   64200.,  439000.])

In [79]:
df.rename(columns={'CompTotal': 'GrossWage', 'CompFreq': 'WageFreq'}, inplace=True)

In [80]:
df['WageFreq'].unique()

array([nan, 'Yearly', 'Monthly', 'Weekly'], dtype=object)

**Prestar atenção no campo a seguir (renomeado para YearlyWageInDollar)**. Ele já traz os valores de salário convertidos na mesma moeda e mesma frequência. Talvez seja interessante tirar dados sobre os salários em diferentes frequências se tem alguma correlação.

In [81]:
df.rename(columns={'ConvertedCompYearly': 'YearlyWageInDollar'}, inplace=True)

Aqui termina a limpeza da primeira seção do formulário

-----

## Segunda seção

Questões 19 a 32

In [82]:
df['LanguageHaveWorkedWith'].unique()

array([nan, 'JavaScript;TypeScript', 'C#;C++;HTML/CSS;JavaScript;Python',
       ..., 'Assembly;C;C++;Java;TypeScript',
       'Bash/Shell;Dart;JavaScript;PHP;Python;SQL;TypeScript',
       'C#;JavaScript;Lua;PowerShell;SQL;TypeScript'], dtype=object)

In [83]:
df['DatabaseHaveWorkedWith'].unique()

array([nan, 'Microsoft SQL Server',
       'Cloud Firestore;Elasticsearch;Microsoft SQL Server;Firebase Realtime Database',
       ...,
       'Cassandra;Cloud Firestore;Elasticsearch;MongoDB;PostgreSQL;SQLite',
       'Elasticsearch;MongoDB;Oracle;SQLite',
       'Microsoft SQL Server;Neo4j;Redis'], dtype=object)

In [84]:
df['PlatformHaveWorkedWith'].unique()

array([nan, 'Firebase;Microsoft Azure', 'AWS;Google Cloud;Heroku', ...,
       'AWS;DigitalOcean;Google Cloud;Heroku;OpenStack',
       'IBM Cloud or Watson;Linode;VMware',
       'Colocation;DigitalOcean;Heroku;Linode;Oracle Cloud Infrastructure;OVH'],
      dtype=object)

In [85]:
df['WebframeHaveWorkedWith'].unique()

array([nan, 'Angular.js', 'ASP.NET;ASP.NET Core ', ...,
       'Angular;ASP.NET;ASP.NET Core ;Blazor;Django;Drupal;Express;Flask;jQuery;Laravel',
       'Express;FastAPI;Flask;Laravel;Next.js;Node.js;React.js;Symfony;Vue.js',
       'Angular.js;Express;jQuery;Next.js;Node.js;Play Framework;React.js'],
      dtype=object)

In [86]:
df['MiscTechHaveWorkedWith'].unique()

array([nan, 'Pandas', '.NET', ...,
       'Apache Spark;Hadoop;Keras;NumPy;Pandas',
       '.NET;Apache Kafka;NumPy;Spring',
       'Apache Kafka;Flutter;Hadoop;Keras;NumPy;Pandas;Scikit-learn;TensorFlow;Torch/PyTorch'],
      dtype=object)

In [87]:
df['ToolsTechHaveWorkedWith'].unique()

array([nan, 'npm', 'Homebrew', 'Homebrew;npm', 'Docker;npm;Terraform',
       'Docker;Homebrew;Kubernetes', 'Docker', 'Docker;Homebrew;npm;Yarn',
       'Docker;npm;Yarn', 'Docker;npm', 'Docker;Kubernetes',
       'Docker;Kubernetes;npm;Terraform;Yarn', 'Kubernetes;npm;Terraform',
       'Docker;Kubernetes;npm', 'Ansible;Docker;npm;Terraform',
       'Ansible;Docker;Terraform', 'npm;Yarn',
       'Docker;Homebrew;Kubernetes;npm;Yarn', 'Ansible;Docker;npm',
       'Docker;Homebrew;Unity 3D',
       'Ansible;Docker;Homebrew;Kubernetes;npm;Puppet;Terraform',
       'Ansible;Docker;Homebrew;Kubernetes;Terraform',
       'Docker;Flow;Kubernetes;npm;Unity 3D', 'Docker;Homebrew',
       'Ansible;Docker;Kubernetes;npm;Terraform;Unity 3D;Unreal Engine;Yarn',
       'Docker;Puppet', 'Docker;Homebrew;npm', 'Unity 3D',
       'Docker;Kubernetes;Puppet', 'Docker;Kubernetes;npm;Yarn',
       'Docker;npm;Terraform;Yarn', 'Docker;Yarn',
       'Ansible;Docker;Kubernetes', 'Docker;Terraform',
       'A

In [88]:
df['NEWCollabToolsHaveWorkedWith'].unique()

array([nan, 'Notepad++;Visual Studio',
       'Notepad++;Visual Studio;Visual Studio Code', ...,
       'Android Studio;Atom;Eclipse;GoLand;IntelliJ;Nano;Notepad++;PyCharm;Sublime Text;Visual Studio Code;Webstorm',
       'CLion;IntelliJ;Notepad++;Rider;Vim;Visual Studio;Visual Studio Code',
       'Eclipse;GoLand;Vim;Visual Studio Code'], dtype=object)

In [89]:
df.rename(columns={'OpSysProfessional use': 'OpSysProfessionalUse', 'OpSysPersonal use': 'OpSysPersonalUse'}, inplace=True)
df['OpSysProfessionalUse'].unique()

array([nan, 'macOS', 'Windows', 'Linux-based;macOS',
       'Windows;Windows Subsystem for Linux (WSL)',
       'Linux-based;macOS;Windows', 'Linux-based;Windows',
       'Windows Subsystem for Linux (WSL)', 'macOS;Windows',
       'Linux-based', 'Linux-based;Windows Subsystem for Linux (WSL)',
       'Linux-based;Windows;Windows Subsystem for Linux (WSL)',
       'Linux-based;macOS;Windows;Windows Subsystem for Linux (WSL)',
       'Other (please specify):',
       'BSD;Linux-based;Windows;Windows Subsystem for Linux (WSL)',
       'macOS;Other (please specify):',
       'macOS;Windows;Windows Subsystem for Linux (WSL)',
       'macOS;Windows Subsystem for Linux (WSL)',
       'Windows;Other (please specify):', 'BSD;Linux-based;macOS;Windows',
       'BSD;Linux-based',
       'BSD;Linux-based;Windows;Other (please specify):',
       'Linux-based;macOS;Windows Subsystem for Linux (WSL)',
       'BSD;Linux-based;Windows Subsystem for Linux (WSL)',
       'BSD;Linux-based;macOS',
       

In [90]:
df['VersionControlSystem'].unique()

array([nan, 'Git', 'Git;Other (please specify):', 'Mercurial;SVN',
       "I don't use one", 'Git;SVN', 'SVN', 'Other (please specify):',
       'Git;Mercurial', 'Git;Other (please specify):;SVN', 'Mercurial',
       'Git;Other (please specify):;Mercurial', 'Git;Mercurial;SVN',
       'Other (please specify):;SVN',
       'Git;Other (please specify):;Mercurial;SVN',
       'Other (please specify):;Mercurial',
       'Other (please specify):;Mercurial;SVN'], dtype=object)

In [91]:
df['VCInteraction'].unique()

array([nan, 'Code editor',
       'Code editor;Command-line;Version control hosting service web GUI;Dedicated version control GUI application',
       'Command-line;Version control hosting service web GUI;Dedicated version control GUI application',
       'Code editor;Command-line', 'Command-line',
       'Command-line;Dedicated version control GUI application',
       'Version control hosting service web GUI;Dedicated version control GUI application',
       'Code editor;Dedicated version control GUI application',
       'Code editor;Command-line;Dedicated version control GUI application',
       'Command-line;Version control hosting service web GUI',
       'Code editor;Command-line;Version control hosting service web GUI',
       'Code editor;Version control hosting service web GUI',
       'Version control hosting service web GUI',
       'Dedicated version control GUI application',
       'Code editor;Version control hosting service web GUI;Dedicated version control GUI applicatio

In [92]:
df.rename(columns={'VCHostingPersonal use': 'VCHostingPersonalUse', 'VCHostingProfessional use': 'VCHostingProfessionalUse'}, inplace=True)
df['VCHostingProfessionalUse'].unique()

array([nan])

In [93]:
df['VCHostingPersonalUse'].unique()

array([nan])

In [94]:
df.drop('VCHostingPersonalUse', axis=1, inplace=True)
df.drop('VCHostingProfessionalUse', axis=1, inplace=True)

In [95]:
df['OfficeStackAsyncHaveWorkedWith'].unique()

array([nan, 'Jira Work Management;Trello', 'Confluence', ...,
       'ClickUp;Confluence;Jira Work Management;Microsoft Planner;Microsoft Lists;monday.com',
       'Confluence;Jira Work Management;Microsoft Planner;Microsoft Lists;Wrike',
       'Airtable;Microsoft Planner;Microsoft Lists;Trello'], dtype=object)

In [96]:
df['OfficeStackSyncHaveWorkedWith'].unique()

array([nan, 'Microsoft Teams', 'Slack;Zoom', 'Microsoft Teams;Zoom',
       'Rocketchat;Slack;Zoom', 'Google Chat;Microsoft Teams;Slack;Zoom',
       'Google Chat;Microsoft Teams;Zoom', 'Google Chat;Slack',
       'Cisco Webex Teams;Google Chat;Microsoft Teams;RingCentral;Slack;Zoom',
       'Microsoft Teams;Slack;Zoom', 'Microsoft Teams;Slack',
       'Google Chat;Microsoft Teams;Slack', 'Zoom',
       'Google Chat;Slack;Zoom', 'Slack', 'Cisco Webex Teams;Zoom',
       'Cisco Webex Teams;Microsoft Teams;Zoom',
       'Cisco Webex Teams;Microsoft Teams;Slack;Zoom',
       'Microsoft Teams;Rocketchat;Slack;Wire;Zoom',
       'Cisco Webex Teams;Microsoft Teams;Slack', 'Google Chat',
       'Cisco Webex Teams;Google Chat;Microsoft Teams;Slack;Zoom',
       'Cisco Webex Teams', 'Google Chat;Microsoft Teams', 'Mattermost',
       'Mattermost;Slack', 'Cisco Webex Teams;Microsoft Teams',
       'Microsoft Teams;Symphony', 'Mattermost;Microsoft Teams;Zoom',
       'Cisco Webex Teams;Slack;Zoom

In [97]:
df['Blockchain'].unique()

array([nan, 'Very unfavorable', 'Unfavorable', 'Favorable',
       'Very favorable', 'Indifferent', 'Unsure'], dtype=object)

Aqui termina a limpeza da segunda seção do formulário

-----

## Terceira seção

Questões 33 a 37

In [98]:
df.drop('NEWSOSites', axis=1, inplace=True)

In [99]:
df['SOVisitFreq'].unique()

array([nan, 'Daily or almost daily', 'Multiple times per day',
       'A few times per week', 'A few times per month or weekly',
       'Less than once per month or monthly'], dtype=object)

In [100]:
df['SOVisitFreq'] = df['SOVisitFreq'].replace({
  "Daily or almost daily": 'daily',
  "Multiple times per day": 'multiple_day',
  "A few times per week": 'multiple_week',
  "A few times per month or weekly": 'weekly',
  "Less than once per month or monthly": 'monthly'
})

In [101]:
df.drop('SOAccount', axis=1, inplace=True)
df.drop('SOPartFreq', axis=1, inplace=True)
df.drop('SOComm', axis=1, inplace=True)

Aqui termina a limpeza da terceira seção do formulário

-----

## Quarta seção

Questões 38 a 44

In [102]:
df['Age'].unique()

array([nan, '25-34 years old', '35-44 years old', 'Under 18 years old',
       '18-24 years old', '45-54 years old', '55-64 years old',
       '65 years or older', 'Prefer not to say'], dtype=object)

In [103]:
df['Age'] = df['Age'].replace({
  'Prefer not to say': 'opt_out'
})

In [104]:
df['Gender'].unique()

array([nan, 'Man', 'Or, in your own words:', 'Woman',
       'Non-binary, genderqueer, or gender non-conforming',
       'Prefer not to say',
       'Man;Non-binary, genderqueer, or gender non-conforming',
       'Or, in your own words:;Non-binary, genderqueer, or gender non-conforming',
       'Woman;Non-binary, genderqueer, or gender non-conforming',
       'Man;Woman', 'Man;Or, in your own words:',
       'Or, in your own words:;Woman;Non-binary, genderqueer, or gender non-conforming',
       'Man;Woman;Non-binary, genderqueer, or gender non-conforming',
       'Or, in your own words:;Woman',
       'Man;Or, in your own words:;Woman;Non-binary, genderqueer, or gender non-conforming',
       'Man;Or, in your own words:;Non-binary, genderqueer, or gender non-conforming',
       'Man;Or, in your own words:;Woman'], dtype=object)

In [105]:
df['Gender'] = df['Gender'].str.replace('Or, in your own words:', 'other', regex=True)
df['Gender'] = df['Gender'].str.replace('Prefer not to say', 'opt_out', regex=True)
df['Gender'] = df['Gender'].str.replace('Non-binary, genderqueer, or gender non-conforming', 'non_binary', regex=True)

df['Gender'] = df['Gender'].str.lower().str.replace(', ', '_').str.replace('-', '_').str.replace(' ', '_')

df['Gender'].unique()

array([nan, 'man', 'other', 'woman', 'non_binary', 'opt_out',
       'man;non_binary', 'other;non_binary', 'woman;non_binary',
       'man;woman', 'man;other', 'other;woman;non_binary',
       'man;woman;non_binary', 'other;woman',
       'man;other;woman;non_binary', 'man;other;non_binary',
       'man;other;woman'], dtype=object)

In [106]:
df['Trans'].unique()

array([nan, 'No', 'Or, in your own words:', 'Yes', 'Prefer not to say'],
      dtype=object)

In [107]:
df['Trans'] = df['Trans'].replace({
  'Or, in your own words:', 'other',
  'Prefer not to say', 'opt_out'
})


df['Trans'] = df['Trans'].str.lower().str.replace(', ', '_').str.replace('-', '_').str.replace(' ', '_')

In [108]:
df['Sexuality'].unique()

array([nan, 'Bisexual', 'Straight / Heterosexual',
       'Prefer to self-describe:', 'Bisexual;Straight / Heterosexual',
       'Gay or Lesbian', 'Prefer to self-describe:;Queer',
       'Prefer not to say', 'Queer', 'Straight / Heterosexual;Queer',
       'Straight / Heterosexual;Prefer to self-describe:',
       'Bisexual;Queer', 'Bisexual;Gay or Lesbian',
       'Gay or Lesbian;Queer',
       'Bisexual;Straight / Heterosexual;Prefer to self-describe:;Queer',
       'Bisexual;Prefer to self-describe:',
       'Prefer to self-describe:;Gay or Lesbian;Queer',
       'Straight / Heterosexual;Prefer to self-describe:;Queer',
       'Bisexual;Straight / Heterosexual;Gay or Lesbian',
       'Bisexual;Straight / Heterosexual;Prefer to self-describe:;Gay or Lesbian;Queer',
       'Bisexual;Straight / Heterosexual;Queer',
       'Bisexual;Gay or Lesbian;Queer',
       'Bisexual;Prefer to self-describe:;Queer',
       'Bisexual;Prefer to self-describe:;Gay or Lesbian',
       'Straight / Hete

In [109]:
df['Sexuality'] = df['Sexuality'].str.replace('Straight / Heterosexual', 'heterosexual', regex=True)
df['Sexuality'] = df['Sexuality'].str.replace('Gay or Lesbian', 'homosexual', regex=True)
df['Sexuality'] = df['Sexuality'].str.replace('Prefer to self-describe:', 'other', regex=True)
df['Sexuality'] = df['Sexuality'].str.replace('Prefer not to say', 'opt_out', regex=True)

df['Sexuality'] = df['Sexuality'].str.lower().str.replace(', ', '_').str.replace('-', '_').str.replace(' ', '_')
df['Sexuality'].unique()

array([nan, 'bisexual', 'heterosexual', 'other', 'bisexual;heterosexual',
       'homosexual', 'other;queer', 'opt_out', 'queer',
       'heterosexual;queer', 'heterosexual;other', 'bisexual;queer',
       'bisexual;homosexual', 'homosexual;queer',
       'bisexual;heterosexual;other;queer', 'bisexual;other',
       'other;homosexual;queer', 'heterosexual;other;queer',
       'bisexual;heterosexual;homosexual',
       'bisexual;heterosexual;other;homosexual;queer',
       'bisexual;heterosexual;queer', 'bisexual;homosexual;queer',
       'bisexual;other;queer', 'bisexual;other;homosexual',
       'heterosexual;homosexual', 'other;homosexual',
       'bisexual;heterosexual;other',
       'bisexual;heterosexual;homosexual;queer',
       'heterosexual;homosexual;queer',
       'heterosexual;other;homosexual;queer',
       'bisexual;heterosexual;other;homosexual',
       'bisexual;other;homosexual;queer', 'heterosexual;other;homosexual'],
      dtype=object)

In [110]:
df['Ethnicity'].unique()

array([nan, 'White', 'Or, in your own words:', ...,
       'White;European;North American;Middle Eastern;Asian;Multiracial',
       'White;Middle Eastern;Central American;Hispanic or Latino/a',
       'White;European;North African;Hispanic or Latino/a'], dtype=object)

In [111]:
df['Ethnicity'] = df['Ethnicity'].str.replace("Hispanic or Latino/a", 'hispanic_or_latino', regex=True)
df['Ethnicity'] = df['Ethnicity'].str.replace("I don't know", 'dont_know', regex=True)
df['Ethnicity'] = df['Ethnicity'].str.replace('Or, in your own words:', 'other', regex=True)
df['Ethnicity'] = df['Ethnicity'].str.replace('Prefer not to say', 'opt_out', regex=True)
# remove os parênteses e tudo que tem dentro
df['Ethnicity'] = df['Ethnicity'].str.replace(r"\(.*\)","")

df['Ethnicity'] = df['Ethnicity'].str.lower().str.replace(', ', '_').str.replace('-', '_').str.replace(' ', '_')

  df['Ethnicity'] = df['Ethnicity'].str.replace(r"\(.*\)","")


In [112]:
df['Accessibility'].unique()

array([nan, 'None of the above', 'Or, in your own words:',
       'I am deaf / hard of hearing', 'Prefer not to say',
       'I am blind / have difficulty seeing',
       'I am unable to / find it difficult to type',
       'I am unable to / find it difficult to walk or stand without assistance',
       'Or, in your own words:;I am blind / have difficulty seeing',
       'I am unable to / find it difficult to type;I am unable to / find it difficult to walk or stand without assistance',
       'I am deaf / hard of hearing;I am unable to / find it difficult to walk or stand without assistance',
       'I am deaf / hard of hearing;I am blind / have difficulty seeing',
       'I am deaf / hard of hearing;I am blind / have difficulty seeing;I am unable to / find it difficult to type;I am unable to / find it difficult to walk or stand without assistance',
       'Or, in your own words:;I am deaf / hard of hearing;I am blind / have difficulty seeing',
       'I am blind / have difficulty seei

In [113]:
df['Accessibility'] = df['Accessibility'].str.replace("I am deaf / hard of hearing", 'hearing', regex=True)
df['Accessibility'] = df['Accessibility'].str.replace("I am blind / have difficulty seeing", 'seeing', regex=True)
df['Accessibility'] = df['Accessibility'].str.replace("I am unable to / find it difficult to type", 'typing', regex=True)
df['Accessibility'] = df['Accessibility'].str.replace("I am unable to / find it difficult to walk or stand without assistance", 'walking', regex=True)
df['Accessibility'] = df['Accessibility'].str.replace("None of the above", 'none', regex=True)
df['Accessibility'] = df['Accessibility'].str.replace('Or, in your own words:', 'other', regex=True)
df['Accessibility'] = df['Accessibility'].str.replace('Prefer not to say', 'opt_out', regex=True)

df['Accessibility'] = df['Accessibility'].str.lower().str.replace(', ', '_').str.replace('-', '_').str.replace(' ', '_')

df['Accessibility'].unique()

array([nan, 'none', 'other', 'hearing', 'opt_out', 'seeing', 'typing',
       'walking', 'other;seeing', 'typing;walking', 'hearing;walking',
       'hearing;seeing', 'hearing;seeing;typing;walking',
       'other;hearing;seeing', 'seeing;walking', 'hearing;typing',
       'other;walking', 'seeing;typing', 'other;hearing',
       'other;hearing;seeing;typing;walking', 'hearing;seeing;walking',
       'other;seeing;typing;walking', 'hearing;seeing;typing',
       'other;hearing;seeing;typing', 'other;typing',
       'other;seeing;typing', 'hearing;typing;walking',
       'other;typing;walking'], dtype=object)

In [114]:
df['MentalHealth'].unique()

array([nan,
       'I have a mood or emotional disorder (e.g., depression, bipolar disorder, etc.);I have an anxiety disorder',
       'None of the above', 'Or, in your own words:',
       'I have a mood or emotional disorder (e.g., depression, bipolar disorder, etc.)',
       'I have a mood or emotional disorder (e.g., depression, bipolar disorder, etc.);I have an anxiety disorder;I have a concentration and/or memory disorder (e.g., ADHD, etc.)',
       'I have a concentration and/or memory disorder (e.g., ADHD, etc.);I have learning differences (e.g., Dyslexic, Dyslexia, etc.)',
       'I have an anxiety disorder',
       "I have autism / an autism spectrum disorder (e.g. Asperger's, etc.)",
       'I have learning differences (e.g., Dyslexic, Dyslexia, etc.)',
       'I have a concentration and/or memory disorder (e.g., ADHD, etc.)',
       'I have an anxiety disorder;I have a concentration and/or memory disorder (e.g., ADHD, etc.)',
       'Prefer not to say',
       "I have a conc

In [115]:
df['MentalHealth'] = df['MentalHealth'].str.replace("I have a mood or emotional disorder \(e.g., depression, bipolar disorder, etc.\)", 'emotional', regex=True)
df['MentalHealth'] = df['MentalHealth'].str.replace("I have an anxiety disorder", 'anxiety', regex=True)
df['MentalHealth'] = df['MentalHealth'].str.replace("I have a concentration and/or memory disorder \(e.g., ADHD, etc.\)", 'concentration', regex=True)
df['MentalHealth'] = df['MentalHealth'].str.replace("I have learning differences \(e.g., Dyslexic, Dyslexia, etc.\)", 'learning', regex=True)
df['MentalHealth'] = df['MentalHealth'].str.replace("I have autism / an autism spectrum disorder \(e.g. Asperger's, etc.\)", 'autism', regex=True)
df['MentalHealth'] = df['MentalHealth'].str.replace("None of the above", 'none', regex=True)
df['MentalHealth'] = df['MentalHealth'].str.replace('Or, in your own words:', 'other', regex=True)
df['MentalHealth'] = df['MentalHealth'].str.replace('Prefer not to say', 'opt_out', regex=True)

df['MentalHealth'] = df['MentalHealth'].str.lower().str.replace(', ', '_').str.replace('-', '_').str.replace(' ', '_')

df['MentalHealth'].unique()

array([nan, 'emotional;anxiety', 'none', 'other', 'emotional',
       'emotional;anxiety;concentration', 'concentration;learning',
       'anxiety', 'autism', 'learning', 'concentration',
       'anxiety;concentration', 'opt_out',
       'concentration;learning;autism', 'emotional;concentration;autism',
       'anxiety;other', 'emotional;anxiety;concentration;autism',
       'emotional;autism', 'concentration;autism', 'other;concentration',
       'emotional;anxiety;concentration;learning',
       'emotional;anxiety;autism', 'emotional;concentration',
       'anxiety;concentration;learning;autism',
       'anxiety;concentration;autism', 'anxiety;autism',
       'anxiety;learning', 'emotional;anxiety;learning', 'other;autism',
       'emotional;learning', 'learning;autism',
       'emotional;anxiety;concentration;learning;autism',
       'emotional;anxiety;other', 'emotional;other',
       'emotional;learning;autism', 'emotional;concentration;learning',
       'anxiety;concentration;lea

Aqui termina a limpeza da quarta seção do formulário

-----

## Quinta seção

Questões 45 a 54

In [116]:
df.drop('TBranch', axis=1, inplace=True)

Notar que essa seção só foi respondida por quem é elegível, e optou que sim na questão anterior (TBranch)

In [117]:
df['ICorPM'].unique()

array([nan, 'Independent contributor', 'People manager'], dtype=object)

In [118]:
df['ICorPM'] = df['ICorPM'].str.lower().str.replace(' ', '_')

In [119]:
df['WorkExp'].unique()

array([nan,  6., 14.,  5., 15.,  4., 23.,  9., 22., 21.,  3., 28.,  7.,
       12.,  8.,  2., 20., 27.,  1., 10., 19., 13., 30., 11., 18., 47.,
       17., 46., 25., 24., 31.,  0., 41., 43., 40., 16., 42., 35., 32.,
       34., 26., 36., 38., 29., 33., 44., 37., 50., 45., 39., 48., 49.])

In [120]:
df.drop('Knowledge_1', axis=1, inplace=True)
df.drop('Knowledge_2', axis=1, inplace=True)
df.drop('Knowledge_3', axis=1, inplace=True)
df.drop('Knowledge_4', axis=1, inplace=True)
df.drop('Knowledge_5', axis=1, inplace=True)
df.drop('Knowledge_6', axis=1, inplace=True)
df.drop('Knowledge_7', axis=1, inplace=True)

df.drop('Frequency_1', axis=1, inplace=True)
df.drop('Frequency_2', axis=1, inplace=True)
df.drop('Frequency_3', axis=1, inplace=True)

In [121]:
df['TimeSearching'].unique()

array([nan, '15-30 minutes a day', '30-60 minutes a day',
       '60-120 minutes a day', 'Less than 15 minutes a day',
       'Over 120 minutes a day'], dtype=object)

In [122]:
df['TimeAnswering'].unique()

array([nan, 'Over 120 minutes a day', '60-120 minutes a day',
       'Less than 15 minutes a day', '30-60 minutes a day',
       '15-30 minutes a day'], dtype=object)

In [123]:
df.drop('Onboarding', axis=1, inplace=True)

In [124]:
df['ProfessionalTech'].unique()

array([nan,
       'Innersource initiative;DevOps function;Microservices;Developer portal or other central places to find tools/services;Continuous integration (CI) and (more often) continuous delivery;Automated testing;Observability tools',
       'Innersource initiative;DevOps function;Microservices;Continuous integration (CI) and (more often) continuous delivery;Automated testing;Observability tools',
       'DevOps function;Microservices',
       'Continuous integration (CI) and (more often) continuous delivery;Automated testing',
       'DevOps function;Continuous integration (CI) and (more often) continuous delivery;Automated testing',
       'None of these',
       'DevOps function;Microservices;Continuous integration (CI) and (more often) continuous delivery;Automated testing;Observability tools',
       'Developer portal or other central places to find tools/services;Continuous integration (CI) and (more often) continuous delivery;Automated testing',
       'DevOps function;Co

In [125]:
df['ProfessionalTech'] = df['ProfessionalTech'].str.replace('Developer portal or other central places to find tools/services', 'developer_portal', regex=True)
df['ProfessionalTech'] = df['ProfessionalTech'].str.replace('Continuous integration (CI) and (more often) continuous delivery', 'ci_cd', regex=True)
df['ProfessionalTech'] = df['ProfessionalTech'].str.replace('None of these', 'none', regex=True)
df['ProfessionalTech'] = df['ProfessionalTech'].str.lower().str.replace(', ', '_').str.replace('-', '_').str.replace(' ', '_')
# remove os parênteses e tudo que tem dentro
df['ProfessionalTech'] = df['ProfessionalTech'].str.replace(r"\(.*\)","")

  df['ProfessionalTech'] = df['ProfessionalTech'].str.replace(r"\(.*\)","")


In [126]:
df.drop('TrueFalse_1', axis=1, inplace=True)
df.drop('TrueFalse_2', axis=1, inplace=True)
df.drop('TrueFalse_3', axis=1, inplace=True)

Aqui termina a limpeza da quinta seção do formulário

-----

## Sexta seção

Questões 55 a 56

In [127]:
df.drop('SurveyLength', axis=1, inplace=True)

In [128]:
df.drop('SurveyEase', axis=1, inplace=True)

Aqui termina a limpeza da sexta seção do formulário

-----

## Finalização

In [129]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73268 entries, 0 to 73267
Data columns (total 56 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   ResponseId                      73268 non-null  int64  
 1   MainBranch                      73268 non-null  object 
 2   Employment                      71709 non-null  object 
 3   RemoteWork                      58958 non-null  object 
 4   CodingActivities                58899 non-null  object 
 5   EdLevel                         71571 non-null  object 
 6   LearnCode                       71580 non-null  object 
 7   LearnCodeOnline                 50685 non-null  object 
 8   LearnCodeCoursesCert            29389 non-null  object 
 9   YearsCode                       71331 non-null  object 
 10  YearsCodePro                    51833 non-null  object 
 11  DevType                         61302 non-null  object 
 12  OrgSize                         

In [130]:
df.to_csv('dataset/survey_results_public_clean.csv', sep=',', encoding='utf-8')