# Limpeza de dados

In [38]:
import numpy as np
import pandas as pd
import re

In [20]:
df = pd.read_csv('dataset/survey_results_public.csv', sep=',', encoding='utf-8')

In [21]:
df.head()

Unnamed: 0,ResponseId,MainBranch,Employment,RemoteWork,CodingActivities,EdLevel,LearnCode,LearnCodeOnline,LearnCodeCoursesCert,YearsCode,...,TimeSearching,TimeAnswering,Onboarding,ProfessionalTech,TrueFalse_1,TrueFalse_2,TrueFalse_3,SurveyLength,SurveyEase,ConvertedCompYearly
0,1,None of these,,,,,,,,,...,,,,,,,,,,
1,2,I am a developer by profession,"Employed, full-time",Fully remote,Hobby;Contribute to open-source projects,,,,,,...,,,,,,,,Too long,Difficult,
2,3,"I am not primarily a developer, but I write co...","Employed, full-time","Hybrid (some remote, some in-person)",Hobby,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",Books / Physical media;Friend or family member...,Technical documentation;Blogs;Programming Game...,,14.0,...,,,,,,,,Appropriate in length,Neither easy nor difficult,40205.0
3,4,I am a developer by profession,"Employed, full-time",Fully remote,I don’t code outside of work,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Books / Physical media;School (i.e., Universit...",,,20.0,...,,,,,,,,Appropriate in length,Easy,215232.0
4,5,I am a developer by profession,"Employed, full-time","Hybrid (some remote, some in-person)",Hobby,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Other online resources (e.g., videos, blogs, f...",Technical documentation;Blogs;Stack Overflow;O...,,8.0,...,,,,,,,,Too long,Easy,


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73268 entries, 0 to 73267
Data columns (total 79 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   ResponseId                      73268 non-null  int64  
 1   MainBranch                      73268 non-null  object 
 2   Employment                      71709 non-null  object 
 3   RemoteWork                      58958 non-null  object 
 4   CodingActivities                58899 non-null  object 
 5   EdLevel                         71571 non-null  object 
 6   LearnCode                       71580 non-null  object 
 7   LearnCodeOnline                 50685 non-null  object 
 8   LearnCodeCoursesCert            29389 non-null  object 
 9   YearsCode                       71331 non-null  object 
 10  YearsCodePro                    51833 non-null  object 
 11  DevType                         61302 non-null  object 
 12  OrgSize                         

Sabemos pelo schema que o ResponseId é exatamente o que o nome dá a entender. Então o usaremos como index do DataFrame.

In [23]:
df.set_index('ResponseId')
print()




## Primeira seção

Questões 1 a 18

Então avançamos para a próxima coluna: MainBranch. Ela demostra de que forma o respondente se relaciona com desenvolvimento. Vamos aproveitar para substituir os valores por termos mais simples. 

In [24]:
print(df['MainBranch'].unique())

['None of these' 'I am a developer by profession'
 'I am not primarily a developer, but I write code sometimes as part of my work'
 'I code primarily as a hobby' 'I am learning to code'
 'I used to be a developer by profession, but no longer am']


In [25]:
df['MainBranch'] = df['MainBranch'].replace({
  'None of these': 'none',
  'I am a developer by profession': 'dev',
  'I am not primarily a developer, but I write code sometimes as part of my work': 'part_dev',
  'I code primarily as a hobby': 'hobbist',
  'I am learning to code': 'learner',
  'I used to be a developer by profession, but no longer am': 'ex_dev',
})
print(df['MainBranch'].unique())

['none' 'dev' 'part_dev' 'hobbist' 'learner' 'ex_dev']


Partindo para a próxima coluna (Employment), podemos notar pela forma que ela está formatada que era uma questão com múltiplas seleções. Para facilitar o gerenciamento futuro, optamos por reduzir os nomes.

In [26]:
print(df['Employment'].unique())

[nan 'Employed, full-time' 'Student, full-time' 'Student, part-time'
 'Not employed, but looking for work'
 'Independent contractor, freelancer, or self-employed'
 'Employed, full-time;Independent contractor, freelancer, or self-employed'
 'Employed, part-time' 'Student, part-time;Employed, part-time'
 'Not employed, and not looking for work'
 'Student, full-time;Employed, part-time'
 'Employed, full-time;Student, part-time'
 'Employed, full-time;Student, full-time'
 'Student, part-time;Independent contractor, freelancer, or self-employed'
 'Retired' 'Student, full-time;Not employed, but looking for work'
 'I prefer not to say'
 'Student, full-time;Independent contractor, freelancer, or self-employed'
 'Student, full-time;Not employed, and not looking for work'
 'Not employed, but looking for work;Independent contractor, freelancer, or self-employed'
 'Employed, full-time;Student, part-time;Independent contractor, freelancer, or self-employed'
 'Independent contractor, freelancer, or s

In [27]:
df['Employment'] = df['Employment'].str.replace('Independent contractor, freelancer, or self-employed', 'independent')
df['Employment'] = df['Employment'].str.replace('Not employed, but looking for work', 'unemployed_looking')
df['Employment'] = df['Employment'].str.replace('Not employed, and not looking for work', 'unemployed_not_looking')
df['Employment'] = df['Employment'].str.replace('I prefer not to say', 'opt_out').str.lower()
df['Employment'] = df['Employment'].str.replace(', ', '_').str.replace('-', '_').str.replace(' ', '_')

In [28]:
df[df['Employment'].isnull()]

Unnamed: 0,ResponseId,MainBranch,Employment,RemoteWork,CodingActivities,EdLevel,LearnCode,LearnCodeOnline,LearnCodeCoursesCert,YearsCode,...,TimeSearching,TimeAnswering,Onboarding,ProfessionalTech,TrueFalse_1,TrueFalse_2,TrueFalse_3,SurveyLength,SurveyEase,ConvertedCompYearly
0,1,none,,,,,,,,,...,,,,,,,,,,
30,31,none,,,,,,,,,...,,,,,,,,,,
86,87,none,,,,,,,,,...,,,,,,,,,,
98,99,dev,,,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",,,,,...,,,,,,,,,,
136,137,hobbist,,,,Some college/university study without earning ...,Friend or family member;Other online resources...,Technical documentation;Blogs;Written Tutorial...,,10,...,,,,,,,,Appropriate in length,Easy,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73109,73110,none,,,,,,,,,...,,,,,,,,,,
73157,73158,none,,,,,,,,,...,,,,,,,,,,
73201,73202,none,,,,,,,,,...,,,,,,,,,,
73215,73216,none,,,,,,,,,...,,,,,,,,,,


Com a impressão acima, nos chamou atenção como algumas linhas possuem muitas respostas vazias. Guardaremos essa informação para analisar depois das colunas.

In [29]:
df['RemoteWork'].unique()

array([nan, 'Fully remote', 'Hybrid (some remote, some in-person)',
       'Full in-person'], dtype=object)

In [30]:
df['RemoteWork'] = df['RemoteWork'].replace({
  'Fully remote': 'remote',
  'Hybrid (some remote, some in-person)': 'hybrid',
  'Full in-person': 'in_person'
})

In [31]:
df['CodingActivities'].unique()

array([nan, 'Hobby;Contribute to open-source projects', 'Hobby',
       'I don’t code outside of work',
       'Hobby;Contribute to open-source projects;Bootstrapping a business',
       'Hobby;Contribute to open-source projects;Freelance/contract work',
       'Hobby;Freelance/contract work', 'Hobby;Bootstrapping a business',
       'Other (please specify):', 'Contribute to open-source projects',
       'Hobby;Other (please specify):',
       'Hobby;Contribute to open-source projects;Bootstrapping a business;Freelance/contract work',
       'Bootstrapping a business', 'Freelance/contract work',
       'Hobby;Bootstrapping a business;Freelance/contract work',
       'Bootstrapping a business;Freelance/contract work',
       'Hobby;Contribute to open-source projects;Other (please specify):',
       'Contribute to open-source projects;Freelance/contract work',
       'Hobby;Freelance/contract work;Other (please specify):',
       'Contribute to open-source projects;Bootstrapping a busine

In [32]:
df['CodingActivities'] = df['CodingActivities'].str.replace('Contribute to open-source projects', 'open_source')
df['CodingActivities'] = df['CodingActivities'].str.replace('Bootstrapping a business', 'business')
df['CodingActivities'] = df['CodingActivities'].str.replace('School or academic work', 'school')
df['CodingActivities'] = df['CodingActivities'].str.replace('Freelance/contract work', 'freelance')
df['CodingActivities'] = df['CodingActivities'].str.replace('Other (please specify):', 'other_').str.lower()
df['CodingActivities'] = df['CodingActivities'].str.replace(', ', '_').str.replace('-', '_').str.replace(' ', '_')

  df['CodingActivities'] = df['CodingActivities'].str.replace('Other (please specify):', 'other_').str.lower()


In [33]:
df['EdLevel'].unique()

array([nan, 'Master’s degree (M.A., M.S., M.Eng., MBA, etc.)',
       'Bachelor’s degree (B.A., B.S., B.Eng., etc.)',
       'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)',
       'Some college/university study without earning a degree',
       'Something else', 'Primary/elementary school',
       'Other doctoral degree (Ph.D., Ed.D., etc.)',
       'Associate degree (A.A., A.S., etc.)',
       'Professional degree (JD, MD, etc.)'], dtype=object)

In [34]:
df['EdLevel'] = df['EdLevel'].replace({
  "Master’s degree (M.A., M.S., M.Eng., MBA, etc.)": 'master',
  "Bachelor’s degree (B.A., B.S., B.Eng., etc.)": 'bachelor',
  "Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)": 'high_school',
  "Some college/university study without earning a degree": 'unfinished_college',
  "Something else": 'other',
  "Primary/elementary school": 'primary_school',
  "Other doctoral degree (Ph.D., Ed.D., etc.)": 'doctor',
  "Associate degree (A.A., A.S., etc.)": 'associate',
  "Professional degree (JD, MD, etc.)": 'professional'
})

In [35]:
df['LearnCode'].unique()

array([nan,
       'Books / Physical media;Friend or family member;Other online resources (e.g., videos, blogs, forum);School (i.e., University, College, etc)',
       'Books / Physical media;School (i.e., University, College, etc)',
       'Other online resources (e.g., videos, blogs, forum);School (i.e., University, College, etc);On the job training',
       'Other online resources (e.g., videos, blogs, forum)',
       'Online Courses or Certification',
       'On the job training;Coding Bootcamp',
       'Books / Physical media;Other online resources (e.g., videos, blogs, forum);School (i.e., University, College, etc)',
       'School (i.e., University, College, etc)',
       'Books / Physical media',
       'Books / Physical media;Other online resources (e.g., videos, blogs, forum);School (i.e., University, College, etc);Online Courses or Certification;Colleague',
       'Other online resources (e.g., videos, blogs, forum);School (i.e., University, College, etc);On the job training

In [36]:
df['LearnCode'] = df['LearnCode'].str.replace('Books / Physical media', 'books')
df['LearnCode'] = df['LearnCode'].str.replace('Friend or family member', 'someone')
df['LearnCode'] = df['LearnCode'].str.replace('Other online resources (e.g., videos, blogs, forum)', 'online')
df['LearnCode'] = df['LearnCode'].str.replace('School (i.e., University, College, etc)', 'school')
df['LearnCode'] = df['LearnCode'].str.replace('On the job training', 'job')
df['LearnCode'] = df['LearnCode'].str.replace('Online Courses or Certification', 'online_course')
df['LearnCode'] = df['LearnCode'].str.replace('Coding Bootcamp', 'bootcamp')
df['LearnCode'] = df['LearnCode'].str.replace('Hackathons (virtual or in-person)', 'hackathon')
df['LearnCode'] = df['LearnCode'].str.replace('Other (please specify):', 'other_')
df['LearnCode'] = df['LearnCode'].str.lower().str.replace(', ', '_').str.replace('-', '_').str.replace(' ', '_')

  df['LearnCode'] = df['LearnCode'].str.replace('Other online resources (e.g., videos, blogs, forum)', 'online')
  df['LearnCode'] = df['LearnCode'].str.replace('School (i.e., University, College, etc)', 'school')
  df['LearnCode'] = df['LearnCode'].str.replace('Hackathons (virtual or in-person)', 'hackathon')
  df['LearnCode'] = df['LearnCode'].str.replace('Other (please specify):', 'other_')


In [44]:
df['LearnCodeOnline'].unique()

array([nan,
       'technical_documentation;blogs;programming_games;written_tutorials;stack_overflow',
       'technical_documentation;blogs;stack_overflow;online_books;video_based_online_courses;online_challenges_',
       ...,
       'technical_documentation;programming_games;written_tutorials;stack_overflow;online_books;video_based_online_courses;online_forum;how_to_videos;interactive_tutorial;coding_sessions_',
       'written_tutorials;online_books;video_based_online_courses;how_to_videos;written_based_online_courses;coding_sessions_;certification_videos',
       'technical_documentation;programming_games;stack_overflow;online_books;video_based_online_courses;how_to_videos;written_based_online_courses;coding_sessions_;certification_videos'],
      dtype=object)

In [43]:
df['LearnCodeOnline'] = df['LearnCodeOnline'].str.lower().str.replace(', ', '_').str.replace('-', '_').str.replace(' ', '_')
# remove os parênteses e tudo que tem dentro
df['LearnCodeOnline'] = df['LearnCodeOnline'].str.replace(r"\(.*\)","")

  df['LearnCodeOnline'] = df['LearnCodeOnline'].str.replace(r"\(.*\)","")


In [45]:
df['LearnCodeCoursesCert'].unique()

array([nan, 'Coursera;Udemy', 'Udemy;Codecademy', 'Coursera;Pluralsight',
       'Coursera;Udemy;Codecademy;edX;Udacity',
       'Coursera;Udemy;Pluralsight;edX', 'Udemy', 'Other',
       'Coursera;Udemy;Udacity', 'Udemy;Pluralsight',
       'Coursera;Udemy;Pluralsight', 'Codecademy', 'Coursera',
       'Coursera;Udemy;edX', 'Udemy;Other', 'Pluralsight',
       'Coursera;Udemy;Codecademy', 'Codecademy;Pluralsight',
       'Coursera;edX', 'Udemy;Codecademy;Pluralsight',
       'Pluralsight;Udacity', 'Coursera;Udemy;Other',
       'Codecademy;Pluralsight;Other',
       'Udemy;Codecademy;Pluralsight;Other', 'Udemy;Pluralsight;Udacity',
       'Coursera;Udemy;Codecademy;Udacity', 'Udemy;edX',
       'Coursera;Udemy;edX;Udacity',
       'Coursera;Pluralsight;edX;Udacity;Other', 'edX',
       'Coursera;Codecademy', 'Coursera;Other', 'Codecademy;Other',
       'Udemy;Codecademy;Pluralsight;edX;Udacity', 'Coursera;Udacity',
       'Udemy;Pluralsight;Other', 'Coursera;Codecademy;Pluralsight;edX

In [46]:
df['LearnCodeCoursesCert'] = df['LearnCodeCoursesCert'].str.lower().str.replace(', ', '_').str.replace('-', '_').str.replace(' ', '_')

In [47]:
df['YearsCode'].unique()

array([nan, '14', '20', '8', '15', '3', '1', '6', '37', '5', '12', '22',
       '11', '4', '7', '13', '36', '2', '25', '10', '40', '16', '27',
       '24', '19', '9', '17', '18', '26', 'More than 50 years', '29',
       '30', '32', 'Less than 1 year', '48', '45', '38', '39', '28', '23',
       '43', '21', '41', '35', '50', '33', '31', '34', '46', '44', '42',
       '47', '49'], dtype=object)

In [48]:
df['YearsCodePro'].unique()

array([nan, '5', '17', '3', '6', '30', '2', '10', '15', '4', '22', '20',
       '40', '9', '14', '21', '7', '18', '25', '8', '12', '45', '1', '19',
       '28', '24', '11', '23', 'Less than 1 year', '32', '27', '16', '44',
       '26', '37', '46', '13', '31', '39', '34', '38', '35', '29', '42',
       '36', '33', '43', '41', '48', '50', 'More than 50 years', '47',
       '49'], dtype=object)

In [51]:
df['DevType'].unique()

array([nan,
       'data_scientist_or_machine_learning_specialist;developer_front_end;engineer_data;engineer_site_reliability',
       'developer_full_stack', ...,
       'data_scientist_or_machine_learning_specialist;developer_front_end;developer_full_stack;developer_back_end;developer_qa_or_test;developer_mobile;database_administrator;cloud_infrastructure_engineer;data_or_business_analyst;designer;blockchain',
       'developer_front_end;developer_full_stack;developer_back_end;developer_desktop_or_enterprise_applications;developer_mobile;educator;developer_embedded_applications_or_devices',
       'developer_front_end;engineer_data;engineer_site_reliability;developer_full_stack;developer_back_end;developer_desktop_or_enterprise_applications;developer_qa_or_test;student;developer_mobile;academic_researcher;devops_specialist;developer_embedded_applications_or_devices;developer_game_or_graphics;cloud_infrastructure_engineer;data_or_business_analyst;designer;scientist;product_manager;sys

In [50]:
df['DevType'] = df['DevType'].str.lower().str.replace(', ', '_').str.replace('-', '_').str.replace(' ', '_')

In [52]:
df['OrgSize'].unique()

array([nan, '20 to 99 employees', '100 to 499 employees', 'I don’t know',
       'Just me - I am a freelancer, sole proprietor, etc.',
       '2 to 9 employees', '5,000 to 9,999 employees',
       '1,000 to 4,999 employees', '10,000 or more employees',
       '500 to 999 employees', '10 to 19 employees'], dtype=object)

Possível análise a se fazer: 
Cruzar tamanho de empresa com salário

In [None]:
df['OrgSize'].unique()

In [53]:
df['PurchaseInfluence'].unique()

array([nan, 'I have some influence', 'I have little or no influence',
       'I have a great deal of influence'], dtype=object)

In [54]:
df.drop('PurchaseInfluence', axis=1, inplace=True)

In [55]:
df['BuyNewTool'].unique()

array([nan, 'Other (please specify):',
       'Start a free trial;Visit developer communities like Stack Overflow',
       'Start a free trial',
       'Start a free trial;Ask developers I know/work with;Research companies that have advertised on sites I visit',
       'Start a free trial;Visit developer communities like Stack Overflow;Ask developers I know/work with',
       'Start a free trial;Ask developers I know/work with',
       'Visit developer communities like Stack Overflow',
       'Start a free trial;Research companies that have advertised on sites I visit',
       'Visit developer communities like Stack Overflow;Ask developers I know/work with',
       'Other (please specify):;Start a free trial;Research companies that have advertised on sites I visit',
       'Ask developers I know/work with',
       'Start a free trial;Visit developer communities like Stack Overflow;Read ratings or reviews on third party sites like G2Crowd',
       'Other (please specify):;Ask developers

In [56]:
df.drop('BuyNewTool', axis=1, inplace=True)

In [57]:
df['Country'].unique()

array([nan, 'Canada',
       'United Kingdom of Great Britain and Northern Ireland', 'Israel',
       'United States of America', 'Germany', 'India', 'Netherlands',
       'Croatia', 'Australia', 'Russian Federation', 'Czech Republic',
       'Austria', 'Serbia', 'Italy', 'Ireland', 'Poland', 'Slovenia',
       'Iraq', 'Sweden', 'Madagascar', 'Norway', 'Taiwan',
       'Hong Kong (S.A.R.)', 'Mexico', 'France', 'Brazil', 'Lithuania',
       'Uruguay', 'Denmark', 'Spain', 'Egypt', 'Turkey', 'South Africa',
       'Ukraine', 'Finland', 'Romania', 'Portugal', 'Singapore', 'Oman',
       'Belgium', 'Chile', 'Bulgaria', 'Latvia', 'Philippines', 'Greece',
       'Belarus', 'Saudi Arabia', 'Kenya', 'Switzerland', 'Iceland',
       'Viet Nam', 'Thailand', 'China', 'Montenegro', 'Slovakia', 'Japan',
       'Luxembourg', 'Turkmenistan', 'Argentina', 'Hungary', 'Tunisia',
       'Bangladesh', 'Maldives', 'Dominican Republic', 'Jordan',
       'Pakistan', 'Nepal', 'Iran, Islamic Republic of...', 'I

In [58]:
df['Country'] = df['Country'].str.lower().str.replace(', ', '_').str.replace('-', '_').str.replace(' ', '_')
# remove os parênteses e tudo que tem dentro
df['Country'] = df['Country'].str.replace(r"\(.*\)","")

  df['Country'] = df['Country'].str.replace(r"\(.*\)","")


In [65]:
df['Currency'].unique()

array([nan, 'CAD', 'GBP', 'ILS', 'USD', 'EUR', 'HRK', 'AUD', 'RUB', 'CZK',
       'PLN', 'NOK', 'TWD', 'HKD', 'INR', 'BRL', 'UYU', 'RSD', 'SEK',
       'DKK', 'EGP', 'TRY', 'RON', 'SGD', 'SAR', 'BGN', 'BYN', 'KES',
       'CHF', 'ZAR', 'ISK', 'THB', 'CNY', 'VND', 'ARS', 'HUF', 'TND',
       'BDT', 'MVR', 'CLP', 'MXN', 'PKR', 'NPR', 'IDR', 'UAH', 'BAM',
       'IMP', 'IRR', 'CRC', 'MUR', 'KZT', 'XOF', 'MAD', 'PHP', 'AMD',
       'NZD', 'AED', 'JPY', 'PEN', 'COP', 'GIP', 'DOP', 'AFN', 'KRW',
       'LBP', 'MYR', 'GTQ', 'AZN', 'LKR', 'UZS', 'QAR', 'NGN', 'MGA',
       'ETB', 'MKD', 'KHR', 'JOD', 'YER', 'ANG', 'FJD', 'PGK', 'UGX',
       'MNT', 'DZD', 'TJS', 'GHS', 'TZS', 'MDL', 'XAF', 'TMT', 'BWP',
       'ALL', 'MMK', 'GEL', 'BHD', 'KWD', 'CVE', 'LYD', 'RWF', 'BMD',
       'BOB', 'KYD', 'VES', 'HNL', 'PYG', 'SYP', 'CUP', 'BTN', 'BZD',
       'JMD', 'BBD', 'NIO', 'ZMW', 'KGS', 'IQD', 'BIF', 'LAK', 'MZN',
       'AOA', 'AWG', 'OMR', 'SDG', 'SHP', 'DJF', 'SCR', 'SZL', 'GNF',
       'TTD', '

In [64]:
# limpa a moeda para ficar apenas o código de 3 letras dela
df['Currency'] = df['Currency'].str.replace(r"[(\t)\s].*","")

  df['Currency'] = df['Currency'].str.replace(r"[(\t)\s].*","")


In [66]:
df['CompTotal'].unique()

array([     nan,   32000.,   60000., ..., 1038000.,   64200.,  439000.])

In [67]:
df.rename(columns={'CompTotal': 'GrossWage', 'CompFreq': 'WageFreq'}, inplace=True)

In [68]:
df['WageFreq'].unique()

array([nan, 'Yearly', 'Monthly', 'Weekly'], dtype=object)

Aqui termina a limpeza da primeira seção do formulário

-----

## Segunda seção

Questões 19 a 32

Aqui termina a limpeza da segunda seção do formulário

-----

## Terceira seção

Questões 33 a 37

Aqui termina a limpeza da terceira seção do formulário

-----

## Quarta seção

Questões 38 a 44

Aqui termina a limpeza da quarta seção do formulário

-----

## Quinta seção

Questões 45 a 54

In [73]:
df.drop('TBranch', axis=1, inplace=True)

Notar que essa seção só foi respondida por quem é elegível, e optou que sim na questão anterior (TBranch)

In [74]:
df['ICorPM'].unique()

array([nan, 'Independent contributor', 'People manager'], dtype=object)

In [75]:
df['ICorPM'] = df['ICorPM'].str.lower().str.replace(' ', '_')

In [76]:
df['WorkExp'].unique()

array([nan,  6., 14.,  5., 15.,  4., 23.,  9., 22., 21.,  3., 28.,  7.,
       12.,  8.,  2., 20., 27.,  1., 10., 19., 13., 30., 11., 18., 47.,
       17., 46., 25., 24., 31.,  0., 41., 43., 40., 16., 42., 35., 32.,
       34., 26., 36., 38., 29., 33., 44., 37., 50., 45., 39., 48., 49.])

**TODO: Knowledge e Frequency**

In [77]:
df['TimeSearching'].unique()

array([nan, '15-30 minutes a day', '30-60 minutes a day',
       '60-120 minutes a day', 'Less than 15 minutes a day',
       'Over 120 minutes a day'], dtype=object)

In [78]:
df['TimeAnswering'].unique()

array([nan, 'Over 120 minutes a day', '60-120 minutes a day',
       'Less than 15 minutes a day', '30-60 minutes a day',
       '15-30 minutes a day'], dtype=object)

In [None]:
df.drop('Onboarding', axis=1, inplace=True)

In [79]:
df['ProfessionalTech'].unique()

array([nan,
       'Innersource initiative;DevOps function;Microservices;Developer portal or other central places to find tools/services;Continuous integration (CI) and (more often) continuous delivery;Automated testing;Observability tools',
       'Innersource initiative;DevOps function;Microservices;Continuous integration (CI) and (more often) continuous delivery;Automated testing;Observability tools',
       'DevOps function;Microservices',
       'Continuous integration (CI) and (more often) continuous delivery;Automated testing',
       'DevOps function;Continuous integration (CI) and (more often) continuous delivery;Automated testing',
       'None of these',
       'DevOps function;Microservices;Continuous integration (CI) and (more often) continuous delivery;Automated testing;Observability tools',
       'Developer portal or other central places to find tools/services;Continuous integration (CI) and (more often) continuous delivery;Automated testing',
       'DevOps function;Co

In [80]:
df['ProfessionalTech'] = df['ProfessionalTech'].str.replace('Developer portal or other central places to find tools/services', 'developer_portal')
df['ProfessionalTech'] = df['ProfessionalTech'].str.replace('Continuous integration (CI) and (more often) continuous delivery', 'ci_cd')
df['ProfessionalTech'] = df['ProfessionalTech'].str.replace('None of these', 'none')
df['ProfessionalTech'] = df['ProfessionalTech'].str.lower().str.replace(', ', '_').str.replace('-', '_').str.replace(' ', '_')
# remove os parênteses e tudo que tem dentro
df['ProfessionalTech'] = df['ProfessionalTech'].str.replace(r"\(.*\)","")

  df['ProfessionalTech'] = df['ProfessionalTech'].str.replace('Continuous integration (CI) and (more often) continuous delivery', 'ci_cd')
  df['ProfessionalTech'] = df['ProfessionalTech'].str.replace(r"\(.*\)","")


In [85]:
df.drop('TrueFalse_1', axis=1, inplace=True)
df.drop('TrueFalse_2', axis=1, inplace=True)
df.drop('TrueFalse_3', axis=1, inplace=True)

Aqui termina a limpeza da quinta seção do formulário

-----

## Sexta seção

Questões 55 a 56

In [69]:
df.drop('SurveyLength', axis=1, inplace=True)

In [70]:
df.drop('SurveyEase', axis=1, inplace=True)

Aqui termina a limpeza da sexta seção do formulário

-----