# Solução


In [1]:
import numpy as np
import pandas as pd

In [2]:
col_names = ['Title', 'Air date', 'Production code', 'Season', 'Number in season',
             'Number in series', 'US viewers (million)', 'Views', 'IMDB rating']

select_cols = ['Title','Air date','Production code','IMDB rating']

df = pd.read_csv('files/simpsons-episodes.tsv',
                 header=None,
                 names=col_names)

df.head()

Unnamed: 0,Title,Air date,Production code,Season,Number in season,Number in series,US viewers (million),Views,IMDB rating
0,Two Cars in Every Garage and Three Eyes on Eve...,,,,,,,,
1,no_val\t15/11/90\t7F08\t2\t6\t19\t25.4\t50691\t8,,,,,,,,
2,Bart the Daredevil\t6/12/90\t7F06\t2\t8\t21\t2...,,,,,,,,
3,Bart Gets Hit by a Car\t10/1/91\tno_val\t2\t10...,,,,,,,,
4,Homer vs. Lisa and the 8th Commandment\t7/2/91...,,,,,,,,


Note que o separador entre os dados é o '\t'

In [3]:
col_names = ['Title', 'Air date', 'Production code', 'Season', 'Number in season',
             'Number in series', 'US viewers (million)', 'Views', 'IMDB rating']

df = pd.read_csv('files/simpsons-episodes.tsv',
                 header=None,
                 names=col_names,
                 sep='\t')

df.head()

Unnamed: 0,Title,Air date,Production code,Season,Number in season,Number in series,US viewers (million),Views,IMDB rating
0,,,,,,,,,
1,,,,,,,,,
2,,,,,,,,,
3,,,,,,,,,
4,Two Cars in Every Garage and Three Eyes on Eve...,1/11/90,7F01,2.0,4.0,17.0,26.1,64959.0,8.1


Precisamos ignorar as linhas [1,4]

In [4]:
col_names = ['Title', 'Air date', 'Production code', 'Season', 'Number in season',
             'Number in series', 'US viewers (million)', 'Views', 'IMDB rating']

df = pd.read_csv('files/simpsons-episodes.tsv',
                 header=None,
                 names=col_names,
                 sep='\t',
                 skiprows=4)

df.head()

Unnamed: 0,Title,Air date,Production code,Season,Number in season,Number in series,US viewers (million),Views,IMDB rating
0,Two Cars in Every Garage and Three Eyes on Eve...,1/11/90,7F01,2,4,17,26.1,64959.0,8.1
1,no_val,15/11/90,7F08,2,6,19,25.4,50691.0,8
2,Bart the Daredevil,6/12/90,7F06,2,8,21,26.2,57605.0,no_val
3,Bart Gets Hit by a Car,10/1/91,no_val,2,10,23,24.8,56486.0,7.8
4,Homer vs. Lisa and the 8th Commandment,7/2/91,7F13,2,13,26,26.2,58277.0,8


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 597 entries, 0 to 596
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Title                 597 non-null    object 
 1   Air date              597 non-null    object 
 2   Production code       597 non-null    object 
 3   Season                597 non-null    object 
 4   Number in season      597 non-null    int64  
 5   Number in series      597 non-null    int64  
 6   US viewers (million)  591 non-null    object 
 7   Views                 593 non-null    float64
 8   IMDB rating           594 non-null    object 
dtypes: float64(1), int64(2), object(6)
memory usage: 42.1+ KB


Nota-se que
- 'no_val' pode ser convertido para NaN
- Air Date não está sendo interpretada como Datetime
- Podemos utilizar a coluna 'Production Code' como sendo o index do dataframe

In [6]:
col_names = ['Title', 'Air date', 'Production code', 'Season', 'Number in season',
             'Number in series', 'US viewers (million)', 'Views', 'IMDB rating']

df = pd.read_csv('files/simpsons-episodes.tsv',
                 header=None,
                 names=col_names,
                 sep='\t',
                 skiprows=4,
                 encoding='UTF-8',
                 na_values='no_val',
                 parse_dates=['Air date'],
                 index_col='Production code')

df.head()

  df = pd.read_csv('files/simpsons-episodes.tsv',


Unnamed: 0_level_0,Title,Air date,Season,Number in season,Number in series,US viewers (million),Views,IMDB rating
Production code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
7F01,Two Cars in Every Garage and Three Eyes on Eve...,1990-01-11,2.0,4,17,26.1,64959.0,8.1
7F08,,1990-11-15,2.0,6,19,25.4,50691.0,8.0
7F06,Bart the Daredevil,1990-06-12,2.0,8,21,26.2,57605.0,
,Bart Gets Hit by a Car,1991-10-01,2.0,10,23,24.8,56486.0,7.8
7F13,Homer vs. Lisa and the 8th Commandment,1991-07-02,2.0,13,26,26.2,58277.0,8.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 597 entries, 7F01 to SABF13
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Title                 575 non-null    object        
 1   Air date              597 non-null    datetime64[ns]
 2   Season                594 non-null    float64       
 3   Number in season      597 non-null    int64         
 4   Number in series      597 non-null    int64         
 5   US viewers (million)  588 non-null    float64       
 6   Views                 593 non-null    float64       
 7   IMDB rating           592 non-null    float64       
dtypes: datetime64[ns](1), float64(4), int64(2), object(1)
memory usage: 42.0+ KB


Como foi especificado somente algumas colunas do dataframe, vamos dropar as desnecessarias

In [8]:
select_cols = ['Title','Air date','Production code','IMDB rating']

colsToDrop = []
for col in df.columns:
    if col not in select_cols:
        colsToDrop.append(col)

df = df.drop(columns=colsToDrop)
df.head()

Unnamed: 0_level_0,Title,Air date,IMDB rating
Production code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7F01,Two Cars in Every Garage and Three Eyes on Eve...,1990-01-11,8.1
7F08,,1990-11-15,8.0
7F06,Bart the Daredevil,1990-06-12,
,Bart Gets Hit by a Car,1991-10-01,7.8
7F13,Homer vs. Lisa and the 8th Commandment,1991-07-02,8.0


# Testes


In [9]:
def test_df_columns():
    return list(df.columns) == ['Title', 'Air date', 'IMDB rating']

def test_df_item():
    return df.iloc[234, 2] == 6.6

def test_df_shape():
    return df.shape == (597, 3)


In [10]:
test_df_columns()

True

In [11]:
test_df_item()

True

In [12]:
test_df_shape()

True