## Paths

In [1]:
src_path = "../src" #from ./code
out_path = "../out" #from ./code
yearf_pattern = "/stack-overflow-developer-survey-20"
csv_pattern = "/survey_results_public.csv"
schema_pattern = "/survey_results_schema.csv"

## Imports

In [2]:
import pandas as pd

## Functions

In [3]:
# Função que recebe uma lista de paths de arquivos csv e retorna uma lista com as colunas comuns em todos os arquivos de input.
def get_common_headers(csv_files):
    # Lista para armazenar headers de todos os arquivos CSV origem
    all_headers = []
    # Lista para armazenar colunas em comum
    common_columns = None
    for file in csv_files:
        # abrir csv
        df = pd.read_csv(file)
        
        # Obter os readers e adicionar a 'all_headers'
        headers = list(df.columns)
        all_headers.append(headers)
        
        # Atualizar common_columns para o primeiro arquivo
        if common_columns is None:
            common_columns = set(headers)
        # para arquivos seguintes
        else:
            # Achar a interseção entre as colunas atuais e as colunas do arquivo atual
            common_columns = common_columns.intersection(headers)

    # converter as colunas comuns de set para lista.
    common_columns = list(common_columns)
    # DEBUG - função para printar os headers de todos os arquivos (para se ter uma visão abrangente)
    def print_all_headers():
        for i, headers in enumerate(all_headers, start=1):
            print(f"File {i} headers:", headers)

    # print_all_headers()

    # retornar a lista com as colunas em comum
    return common_columns

## Verificação de colunas comuns entre os datasets
Para uma análise comparativa entre os cinco anos do cenário, é importante comparar dados equivalentes entre os datasets.

Nessa etapa, realizar-se-á a geração de um header que contenha somente os elementos comuns entre os datasets.

#### Obtenção dos paths dos arquivos CSV origem

In [4]:
# Obtenção dos paths dos anos 19, 20, 21, 22, 23
csv_files = []
for year in range(19, 24):
    csv_files.append(src_path + yearf_pattern + str(year) + csv_pattern)

In [5]:
# Obtenção dos headers em comum entre todos os CSVs de origem
common_columns = get_common_headers(csv_files)

In [6]:
print("\nDos arquivos:" + ''.join(['\n{}'.format(x) for x in csv_files]) + "\n\n As colunas em comum são:\n" + ''.join(['- {}\n'.format(y) for y in common_columns]))
print("\nQuantidade de colunas em comum: ", len(common_columns))


Dos arquivos:
../src/stack-overflow-developer-survey-2019/survey_results_public.csv
../src/stack-overflow-developer-survey-2020/survey_results_public.csv
../src/stack-overflow-developer-survey-2021/survey_results_public.csv
../src/stack-overflow-developer-survey-2022/survey_results_public.csv
../src/stack-overflow-developer-survey-2023/survey_results_public.csv

 As colunas em comum são:
- SOComm
- EdLevel
- DevType
- SurveyLength
- OrgSize
- Age
- Employment
- SurveyEase
- SOAccount
- YearsCodePro
- CompTotal
- SOPartFreq
- MainBranch
- Country
- YearsCode
- SOVisitFreq


Quantidade de colunas em comum:  16


### Análise Visual
Após uma análise visual entre os datasets, verificou-se que, os datasets dos anos 2021, 2022 e 2023, contém um número maior de colunas em comum. 

Será realizada uma verificação para confirmar a suspeita.

In [7]:
# Obtenção dos paths dos anos 19, 20
csv_files2 = []
for year in range(19,21):
    csv_files2.append(src_path + yearf_pattern + str(year) + csv_pattern)

In [8]:
# Obtenção dos headers em comum entre todos os CSVs de origem
common_columns2 = get_common_headers(csv_files2)

In [9]:
print("\nDos arquivos:" + ''.join(['\n{}'.format(x) for x in csv_files2]) + "\n\n As colunas em comum são:\n" + ''.join(['- {}\n'.format(y) for y in common_columns2]))
print("\nQuantidade de colunas em comum: ", len(common_columns2))


Dos arquivos:
../src/stack-overflow-developer-survey-2019/survey_results_public.csv
../src/stack-overflow-developer-survey-2020/survey_results_public.csv

 As colunas em comum são:
- LanguageWorkedWith
- PlatformDesireNextYear
- CurrencySymbol
- JobSeek
- MainBranch
- EdLevel
- PlatformWorkedWith
- OrgSize
- LanguageDesireNextYear
- MiscTechWorkedWith
- SurveyEase
- CurrencyDesc
- Trans
- CompTotal
- SOPartFreq
- Country
- Respondent
- OpSys
- MiscTechDesireNextYear
- JobFactors
- WorkWeekHrs
- Employment
- Gender
- PurchaseWhat
- YearsCodePro
- Hobbyist
- Age1stCode
- YearsCode
- DatabaseDesireNextYear
- SOVisitFreq
- JobSat
- SOComm
- DevType
- SurveyLength
- ConvertedComp
- WelcomeChange
- Age
- Ethnicity
- CompFreq
- UndergradMajor
- SOAccount
- DatabaseWorkedWith
- Sexuality


Quantidade de colunas em comum:  43


In [10]:
# Obtenção dos paths dos anos 21, 22, 23
csv_files2 = []
for year in range(21, 24):
    csv_files2.append(src_path + yearf_pattern + str(year) + csv_pattern)

In [11]:
# Obtenção dos headers em comum entre todos os CSVs de origem
common_columns2 = get_common_headers(csv_files2)

In [12]:
print("\nDos arquivos:" + ''.join(['\n{}'.format(x) for x in csv_files2]) + "\n\n As colunas em comum são:\n" + ''.join(['- {}\n'.format(y) for y in common_columns2]))
print("\nQuantidade de colunas em comum: ", len(common_columns2))


Dos arquivos:
../src/stack-overflow-developer-survey-2021/survey_results_public.csv
../src/stack-overflow-developer-survey-2022/survey_results_public.csv
../src/stack-overflow-developer-survey-2023/survey_results_public.csv

 As colunas em comum são:
- LanguageHaveWorkedWith
- ToolsTechWantToWorkWith
- DatabaseWantToWorkWith
- PlatformHaveWorkedWith
- MiscTechWantToWorkWith
- MainBranch
- DatabaseHaveWorkedWith
- ConvertedCompYearly
- EdLevel
- LearnCode
- OrgSize
- NEWCollabToolsWantToWorkWith
- MiscTechHaveWorkedWith
- SurveyEase
- CompTotal
- SOPartFreq
- Country
- NEWSOSites
- Currency
- NEWCollabToolsHaveWorkedWith
- WebframeHaveWorkedWith
- PlatformWantToWorkWith
- ResponseId
- Employment
- YearsCodePro
- WebframeWantToWorkWith
- YearsCode
- SOVisitFreq
- LanguageWantToWorkWith
- SOComm
- DevType
- SurveyLength
- Age
- SOAccount
- ToolsTechHaveWorkedWith


Quantidade de colunas em comum:  35


### Mudanças de nomenclatura
Após a análise visual foi notório que, algumas colunas continham nomes diferentes para o mesmo tipo de valor. O próximo passo foi a normalização de nomemclatura.

## 2019

Período da pesquisa: 23 de Janeiro a 14 de Fevereiro de 2019.

In [13]:
year = "19"
filtered_csv = "/survey_results_20" + year + "_filtered.csv"

In [14]:
df = pd.read_csv(src_path + yearf_pattern + year + csv_pattern)

In [15]:
# column_headers = df.columns
# print("Colunas do arquivo .csv origem:", column_headers)
# del column_headers

In [16]:
brazil_rows = df[df['Country'] == 'Brazil']
del df

In [17]:
selected_columns = [
    "Respondent", "MainBranch", "Hobbyist", "OpenSourcer", "Employment", "EdLevel", "OrgSize", "YearsCodePro","YearsCode", "DevType", 
    "ConvertedComp", "LanguageWorkedWith", "LanguageDesireNextYear", "DatabaseWorkedWith", "DatabaseDesireNextYear", "PlatformWorkedWith", "PlatformDesireNextYear",
    "WebFrameWorkedWith", "WebFrameDesireNextYear", "DevEnviron", "OpSys", "Age", "Gender", "MiscTechWorkedWith", "MiscTechDesireNextYear"
                   ]
print("Quantidade de colunas selecionadas:", len(selected_columns))

Quantidade de colunas selecionadas: 25


In [18]:
column_rename_mapping = {
    "Respondent" : "ResponseId",
    "ConvertedComp" : "ConvertedCompYearly",
    "LanguageWorkedWith" : "LanguageHaveWorkedWith",
    "LanguageDesireNextYear" : "LanguageWantToWorkWith",
    "DatabaseWorkedWith" : "DatabaseHaveWorkedWith",
    "DatabaseDesireNextYear" : "DatabaseWantToWorkWith",
    "PlatformWorkedWith" : "PlatformHaveWorkedWith",
    "PlatformDesireNextYear" : "PlatformWantToWorkWith",
    "WebFrameWorkedWith" : "WebframeHaveWorkedWith",
    "WebFrameDesireNextYear" : "WebframeWantToWorkWith",
    "DevEnviron" : "NEWCollabToolsHaveWorkedWith",
    "MiscTechWorkedWith" : "MiscTechHaveWorkedWith",
    "MiscTechDesireNextYear" : "MiscTechWantToWorkWith",
}

In [19]:
filtered_data = brazil_rows[selected_columns].rename(columns=column_rename_mapping)
del brazil_rows
numero_de_linhas = len(filtered_data)
print("Número de linhas:", numero_de_linhas)
print("Número de colunas: ", len(filtered_data.columns))

Número de linhas: 1948
Número de colunas:  25


In [20]:
coding_activities_values = []

# Iterating through rows of the original DataFrame
for index, row in filtered_data.iterrows():
    coding_activities = ""
    
    # Check if "Hobbyst" is "yes" and add to coding activities
    if row["Hobbyist"] == "Yes":
        coding_activities += "Hobby;"

    # Check if "OpenSourcer" is not "never" and add to coding activities
    if row["OpenSourcer"] != "Never" and row["OpenSourcer"] != "":
        coding_activities += "Contribute to open-source projects;"
        
    if coding_activities.endswith(";"):
        coding_activities = coding_activities[:-1]
        
    # Appending the calculated coding activities value to the list
    coding_activities_values.append(coding_activities)

In [21]:
filtered_data["CodingActivities"] = coding_activities_values
filtered_data["NEWCollabToolsHaveWorkedWith"] = None
filtered_data["NEWCollabToolsWantToWorkWith"] = None
del coding_activities_values

In [22]:
filtered_data.drop(columns=['Hobbyist', 'OpenSourcer'], inplace=True)

In [23]:
print(len(filtered_data.columns))
filtered_data.to_csv((out_path + filtered_csv), index=False)

25


In [24]:
del filtered_data

## 2020


In [None]:
year = "20"
filtered_csv = "/survey_results_20" + year + "_filtered.csv"

In [None]:
df = pd.read_csv(src_path + yearf_pattern + year + csv_pattern)

In [None]:
brazil_rows = df[df['Country'] == 'Brazil']
del df

In [None]:
selected_columns = [
    "Respondent", "MainBranch", "Hobbyist", "Employment", "EdLevel", "OrgSize", "YearsCodePro","YearsCode", "DevType", 
    "ConvertedComp", "LanguageWorkedWith", "LanguageDesireNextYear", "DatabaseWorkedWith", "DatabaseDesireNextYear", "PlatformWorkedWith", "PlatformDesireNextYear",
    "WebframeWorkedWith", "WebframeDesireNextYear", "NEWCollabToolsWorkedWith", "NEWCollabToolsDesireNextYear", "OpSys", "Age", "Gender", "MiscTechWorkedWith", "MiscTechDesireNextYear"
                   ]
print("Quantidade de colunas selecionadas:", len(selected_columns))

In [None]:
column_rename_mapping = {
    "Respondent" : "ResponseId",
    "ConvertedComp" : "ConvertedCompYearly",
    "LanguageWorkedWith" : "LanguageHaveWorkedWith",
    "LanguageDesireNextYear" : "LanguageWantToWorkWith",
    "DatabaseWorkedWith" : "DatabaseHaveWorkedWith",
    "DatabaseDesireNextYear" : "DatabaseWantToWorkWith",
    "PlatformWorkedWith" : "PlatformHaveWorkedWith",
    "PlatformDesireNextYear" : "PlatformWantToWorkWith",
    "WebframeWorkedWith" : "WebframeHaveWorkedWith",
    "WebframeDesireNextYear" : "WebframeWantToWorkWith",
    "MiscTechWorkedWith" : "MiscTechHaveWorkedWith",
    "MiscTechDesireNextYear" : "MiscTechWantToWorkWith",
    "NEWCollabToolsWorkedWith" : "NEWCollabToolsHaveWorkedWith",
    "NEWCollabToolsDesireNextYear" : "NEWCollabToolsWantToWorkWith",
}

In [None]:
filtered_data = brazil_rows[selected_columns].rename(columns=column_rename_mapping)
del brazil_rows
numero_de_linhas = len(filtered_data)
print("Número de linhas:", numero_de_linhas)
print("Número de colunas: ", len(filtered_data.columns))

In [None]:
coding_activities_values = []

# Iterating through rows of the original DataFrame
for index, row in filtered_data.iterrows():
    coding_activities = ""
    
    # Check if "Hobbyst" is "yes" and add to coding activities
    if row["Hobbyist"] == "Yes":
        coding_activities += "Hobby;"
        
    if coding_activities.endswith(";"):
        coding_activities = coding_activities[:-1]
        
    # Appending the calculated coding activities value to the list
    coding_activities_values.append(coding_activities)

In [None]:
filtered_data["CodingActivities"] = coding_activities_values
del coding_activities_values

In [None]:
filtered_data.drop(columns=['Hobbyist'], inplace=True)

In [None]:
print(len(filtered_data.columns))
filtered_data.to_csv((out_path + filtered_csv), index=False)

In [None]:
del filtered_data

## 2021

In [None]:
year = "21"
filtered_csv = "/survey_results_20" + year + "_filtered.csv"

In [None]:
df = pd.read_csv(src_path + yearf_pattern + year + csv_pattern)
brazil_rows = df[df['Country'] == 'Brazil']
del df
# print( ''.join(['{}\n'.format(x) for x in brazil_rows.columns]))

In [None]:
selected_columns = [
    "ResponseId", "MainBranch", "Employment", "EdLevel", "OrgSize", "YearsCodePro","YearsCode", "DevType", 
    "ConvertedCompYearly", "LanguageHaveWorkedWith", "LanguageWantToWorkWith", "DatabaseHaveWorkedWith", "DatabaseWantToWorkWith",
    "PlatformHaveWorkedWith", "PlatformWantToWorkWith", "WebframeHaveWorkedWith", "WebframeWantToWorkWith", 
    "NEWCollabToolsHaveWorkedWith", "NEWCollabToolsWantToWorkWith", "OpSys", "Age", "Gender", 
    "MiscTechHaveWorkedWith", "MiscTechWantToWorkWith"
                   ]
print("Quantidade de colunas selecionadas:", len(selected_columns))

In [None]:
filtered_data = brazil_rows[selected_columns]
del brazil_rows

In [None]:
filtered_data['CodingActivities'] = None

In [None]:
print(len(filtered_data.columns))
filtered_data.to_csv((out_path + filtered_csv), index=False)

In [None]:
del filtered_data

## 2022

In [None]:
year = "22"
filtered_csv = "/survey_results_20" + year + "_filtered.csv"

In [None]:
df = pd.read_csv(src_path + yearf_pattern + year + csv_pattern)
brazil_rows = df[df['Country'] == 'Brazil']
del df
# print( ''.join(['{}\n'.format(x) for x in brazil_rows.columns]))

In [None]:
selected_columns = [
    "ResponseId", "MainBranch", "Employment", "EdLevel", "OrgSize", "YearsCodePro","YearsCode", "DevType", 
    "ConvertedCompYearly", "LanguageHaveWorkedWith", "LanguageWantToWorkWith", "DatabaseHaveWorkedWith", "DatabaseWantToWorkWith",
    "PlatformHaveWorkedWith", "PlatformWantToWorkWith", "WebframeHaveWorkedWith", "WebframeWantToWorkWith", 
    "NEWCollabToolsHaveWorkedWith", "NEWCollabToolsWantToWorkWith", "OpSysProfessional use", "Age", "Gender", 
    "MiscTechHaveWorkedWith", "MiscTechWantToWorkWith", "CodingActivities"
                   ]
print("Quantidade de colunas selecionadas:", len(selected_columns))

In [None]:
column_rename_mapping = {
    "OpSysProfessional use" : "OpSys",
}

In [None]:
filtered_data = brazil_rows[selected_columns].rename(columns=column_rename_mapping)
del brazil_rows

In [None]:
print(len(filtered_data.columns))
filtered_data.to_csv((out_path + filtered_csv), index=False)

In [None]:
del filtered_data

## 2023

In [None]:
year = "23"
filtered_csv = "/survey_results_20" + year + "_filtered.csv"

In [None]:
df = pd.read_csv(src_path + yearf_pattern + year + csv_pattern)
brazil_rows = df[df['Country'] == 'Brazil']
del df
# print( ''.join(['{}\n'.format(x) for x in brazil_rows.columns]))

In [None]:
selected_columns = [
    "ResponseId", "MainBranch", "Employment", "EdLevel", "OrgSize", "YearsCodePro","YearsCode", "DevType", 
    "ConvertedCompYearly", "LanguageHaveWorkedWith", "LanguageWantToWorkWith", "DatabaseHaveWorkedWith", "DatabaseWantToWorkWith",
    "PlatformHaveWorkedWith", "PlatformWantToWorkWith", "WebframeHaveWorkedWith", "WebframeWantToWorkWith", 
    "NEWCollabToolsHaveWorkedWith", "NEWCollabToolsWantToWorkWith", "OpSysProfessional use", "Age",
    "MiscTechHaveWorkedWith", "MiscTechWantToWorkWith", "CodingActivities"
                   ]
print("Quantidade de colunas selecionadas:", len(selected_columns))

In [None]:
column_rename_mapping = {
    "OpSysProfessional use" : "OpSys",
}

In [None]:
filtered_data = brazil_rows[selected_columns].rename(columns=column_rename_mapping)
del brazil_rows

In [None]:
filtered_data['Gender'] = None

In [None]:
print(len(filtered_data.columns))
filtered_data.to_csv((out_path + filtered_csv), index=False)

In [None]:
del filtered_data