# Processing the File astronauts.csv

Given the file astronauts.csv, we will create a function that transforms the data as follows:
<br>
a) Shows average missions and average days in space.<br>
b) States % of astronauts in space by country.<br>
c) Shows company that has sent the most missions by country.<br>
d) Writes new dataframe into csv file.<br>

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('astronauts.csv')

In [3]:
df.head(5)

Unnamed: 0,s.no/code given,Astronaut name,country,company space agency,missions,days in space,spacewalks,days spacewalking,acheivement 1,acheivement 2,acheivement 3,acheivement 4,still in space
0,612,Cameron Bess,United States of America,Blue Origin,1,<.1,-,-,Crossed Kármán Line,,,,
1,611,Lane Bess,United States of America,Blue Origin,1,<.1,-,-,Crossed Kármán Line,,,,
2,610,Evan Dick,United States of America,Blue Origin,1,<.1,-,-,Crossed Kármán Line,,,,
3,609,Dylan Taylor,United States of America,Blue Origin,1,<.1,-,-,Crossed Kármán Line,,,,
4,608,Michael Strahan,United States of America,Blue Origin,1,<.1,-,-,Crossed Kármán Line,,,,


In [4]:
df.info

<bound method DataFrame.info of     s.no/code given  Astronaut name                    country  \
0               612     Cameron Bess  United States of America   
1               611        Lane Bess  United States of America   
2               610        Evan Dick  United States of America   
3               609     Dylan Taylor  United States of America   
4               608  Michael Strahan  United States of America   
..              ...              ...                       ...   
947             NaN  Sergey Korsakov                    Russia   
948             NaN     Eytan Stibbe                    Israel   
949             NaN     Robert Hines  United States of America   
950             NaN    Denis Matveev                    Russia   
951             NaN    Deng Qingming                     China   

    company space agency missions days in space spacewalks days spacewalking  \
0            Blue Origin        1           <.1          -                 -   
1            Bl

In [5]:
df.columns

Index(['s.no/code given', 'Astronaut name ', 'country', 'company space agency',
       'missions', 'days in space', 'spacewalks', 'days spacewalking',
       'acheivement 1', 'acheivement 2', 'acheivement 3', 'acheivement 4',
       'still in space'],
      dtype='object')

In [6]:
def process_astronauts(file):
    
    # Preprocess original file
    df = pd.read_csv(file)
    df.drop(columns = ['s.no/code given','spacewalks','days spacewalking','still in space'],axis = 1, inplace = True)
    df['missions'] = pd.to_numeric(df['missions'],errors = 'coerce')
    df['days in space'] = pd.to_numeric(df['days in space'],errors = 'coerce')
    
    #Create new dataframe
    df2 = df[['country']].drop_duplicates().reset_index(drop = True)
    df2.rename(columns = {'country':'Country'}, inplace = True)
    
    # Calculate total missions per country
    missions = []
    i=0
    while(i<len(df2)):
        missions.append(int(df[df['country'] == df2['Country'][i]]['missions'].sum()))
        i+=1
    df2['Total missions']=missions
    
    # Calculate average days in space per country
    days_in_space = []
    i=0
    while(i<len(df2)):
        days_in_space.append(df[df['country'] == df2['Country'][i]]['days in space'].sum()/df['country'].value_counts()[i])
        i+=1
    df2['Avg. days in space'] = days_in_space
    
    # Calculate percentage of astronauts per country
    astronauts = []
    i=0
    while(i<len(df2)):    
        astronauts.append(len(pd.unique(df[df['country'] == df2['Country'][i]]['Astronaut name '])))
        i+=1
    df2['Total astronauts' ]= astronauts
    df2['% of astronauts'] = astronauts/df2['Total astronauts'].sum()
    df2.drop(columns = ['Total astronauts'],axis = 1, inplace = True)
    
    # Calculate company with most missions
    companies = []
    i=0
    while(i<len(df2)):
        companies.append(pd.Series(df[df['country'] == df2['Country'][i]]['company space agency'].mode()).values)
        i+=1 
    df2['Company with the most missions'] = companies
    
    # Write new dataframe to csv file
    df2.to_csv('processed_astronauts.csv', encoding='utf-8', index=False)
        
    return df2

In [7]:
process_astronauts('astronauts.csv')

Unnamed: 0,Country,Total missions,Avg. days in space,% of astronauts,Company with the most missions
0,United States of America,1057,45.439962,0.549947,[NASA]
1,Japan,33,10.207602,0.024185,[NASDA]
2,Germany,17,8.482609,0.012618,[DLR]
3,China,27,38.278261,0.022082,[CMS]
4,Australia,1,0.0,0.001052,[Blue Origin]
5,Canada,19,44.375,0.012618,[CSA]
6,Russia,188,1585.433333,0.120925,[Roscosmos]
7,Netherlands,4,17.558333,0.003155,[ESA]
8,UK,6,27.685714,0.004206,[Virgin Galactic]
9,United Arab Emirates,1,1.316667,0.001052,[MBRSC]
