In [439]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression

pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)

In [440]:
file_train = "train.csv"
file_test = "test.csv"

df_raw_train = pd.read_csv(file_train, index_col=False)
df_raw_test = pd.read_csv(file_test, index_col=False)
df.shape, df_test.shape

((13730, 167), (4576, 31))

In [441]:
features=['NU_INSCRICAO','CO_UF_RESIDENCIA', 'SG_UF_RESIDENCIA', 'NU_IDADE', 'TP_SEXO', 'TP_COR_RACA', 
          'TP_NACIONALIDADE','TP_ST_CONCLUSAO','TP_ANO_CONCLUIU', 'TP_ESCOLA', 'TP_ENSINO',
          'TP_PRESENCA_CN', 'TP_PRESENCA_CH', 'TP_PRESENCA_LC','NU_NOTA_CN', 'NU_NOTA_CH', 
          'NU_NOTA_LC','TP_LINGUA','TP_STATUS_REDACAO', 'NU_NOTA_COMP1', 'NU_NOTA_COMP2', 
          'NU_NOTA_COMP3','NU_NOTA_COMP4', 'NU_NOTA_COMP5', 'NU_NOTA_REDACAO','Q001', 'Q002',
          'Q006', 'Q024', 'Q025', 'Q026', 'Q027', 'Q047','NU_NOTA_MT']

train_numerical_vars = ['NU_INSCRICAO','NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC','NU_NOTA_COMP1', 'NU_NOTA_COMP2', 'NU_NOTA_COMP3', 
                  'NU_NOTA_COMP4','NU_NOTA_COMP5', 'NU_NOTA_REDACAO','NU_NOTA_MT'] 
test_numerical_vars = ['NU_INSCRICAO','NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC','NU_NOTA_COMP1', 'NU_NOTA_COMP2', 'NU_NOTA_COMP3', 
                  'NU_NOTA_COMP4','NU_NOTA_COMP5', 'NU_NOTA_REDACAO'] 
target = ['NU_NOTA_MT']

In [442]:
df_train=df_raw_train[features]
df_train.drop(['SG_UF_RESIDENCIA','TP_ENSINO'], inplace=True, axis=1)

df_test=df_raw_test[features[:-1]]
df_test.drop(['SG_UF_RESIDENCIA','TP_ENSINO'], inplace=True, axis=1)

df_train[train_numerical_vars] = df_train[train_numerical_vars].fillna(0)
df_test[test_numerical_vars] = df_test[test_numerical_vars].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


In [443]:
df_train_clean = pd.DataFrame(index=df_train.index)
df_test_clean = pd.DataFrame(index=df_test.index)


In [326]:
#df_train_clean.info(), df_test_clean.info()
#df_test_clean

# Limpeza dos dados e categorização

In [444]:
df_train_clean['NU_INSCRICAO'] = df_train['NU_INSCRICAO']
df_test_clean['NU_INSCRICAO'] = df_test['NU_INSCRICAO']

In [446]:
def create_encoder(column, prefix):

    #encoder = OneHotEncoder()
    #train_column_df = pd.DataFrame(encoder.fit_transform(df_train[[column]]).toarray())
    #test_column_df = pd.DataFrame(encoder.fit_transform(df_test[[column]]).toarray())
    
    train_column_df = pd.get_dummies(df_train[column])
    test_column_df = pd.get_dummies(df_test[column])
    
    train_name_columns = df_train[column].sort_values().unique()
    train_name_columns_co = [str(prefix) + str(train_name_column) for train_name_column in train_name_columns]
    
    test_name_columns = df_test[column].sort_values().unique()
    test_name_columns_co = [str(prefix) + str(test_name_column) for test_name_column in test_name_columns] 
    
    train_column_df.columns=train_name_columns_co
    test_column_df.columns=test_name_columns_co
    
    global df_train_clean
    global df_test_clean
    
    df_train_clean = pd.concat([df_train_clean, train_column_df ], axis=1)
    df_test_clean = pd.concat([df_test_clean, test_column_df ], axis=1)



In [453]:
categorical_vars = {'CO_UF_RESIDENCIA' : 'co_uf_', 'TP_SEXO' : 'sexo_', 'TP_COR_RACA': 'raca_', 'TP_ST_CONCLUSAO': 'tp_st_con_', 
                    'TP_ANO_CONCLUIU': 'tp_ano_con_', 'TP_ESCOLA': 'tp_esc_','TP_PRESENCA_CN': 'tp_pres_cn', 
                    'TP_PRESENCA_CH': 'tp_pres_ch', 'TP_PRESENCA_LC': 'tp_pres_lc', 'TP_LINGUA': 'tp_ling_', 
                    'Q001': 'q001_', 'Q002': 'q002_', 'Q006': 'q006_', 'Q024': 'q024_',
                    'Q025': 'q025_', 'Q026': 'q026_', 'Q047': 'q047_'}

#'TP_STATUS_REDACAO': 'tp_stat_red_', 'Q027': 'q027_', 

In [454]:
for column, prefix in categorical_vars.items():
    create_encoder(column, prefix)

In [1]:
#Inserindo as numericas

train_numerical_vars = ['NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC','NU_NOTA_COMP1', 'NU_NOTA_COMP2', 'NU_NOTA_COMP3', 
                  'NU_NOTA_COMP4','NU_NOTA_COMP5', 'NU_NOTA_REDACAO'] 
test_numerical_vars = ['NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC','NU_NOTA_COMP1', 'NU_NOTA_COMP2', 'NU_NOTA_COMP3', 
                  'NU_NOTA_COMP4','NU_NOTA_COMP5', 'NU_NOTA_REDACAO']

df_train_clean = pd.concat([df_train_clean, df_train[train_numerical_vars]], axis=1)
df_test_clean = pd.concat([df_test_clean, df_test[test_numerical_vars]], axis=1)

NameError: name 'pd' is not defined

In [457]:
X_train = df_train_clean.loc[:,'co_uf_11':]
y_train = df_train['NU_NOTA_MT']

In [458]:
X_test = df_test_clean.loc[:,'co_uf_11':]
X_train.shape, y_train.shape, X_test.shape

((13730, 122), (13730,), (4576, 119))

In [459]:
X_train_comp_X_test = X_train[X_test.columns]

In [460]:
X_train_comp_X_test.shape, y_train.shape, X_test.shape

((13730, 119), (13730,), (4576, 119))

In [461]:
regressor = LinearRegression()
regressor.fit(X_train_comp_X_test, y_train)

LinearRegression()

In [462]:
y_pred = regressor.predict(X_test)

In [409]:
X_test.head(5)

Unnamed: 0,co_uf_11,co_uf_12,co_uf_13,co_uf_14,co_uf_15,co_uf_16,co_uf_17,co_uf_21,co_uf_22,co_uf_23,co_uf_24,co_uf_25,co_uf_26,co_uf_27,co_uf_28,co_uf_29,co_uf_31,co_uf_32,co_uf_33,co_uf_35,co_uf_41,co_uf_42,co_uf_43,co_uf_50,co_uf_51,co_uf_52,co_uf_53,sexo_F,sexo_M,raca_0,raca_1,raca_2,raca_3,raca_4,raca_5,tp_st_con_1,tp_st_con_2,tp_st_con_3,tp_st_con_4,tp_ano_con_0,tp_ano_con_1,tp_ano_con_2,tp_ano_con_3,tp_ano_con_4,tp_ano_con_5,tp_ano_con_6,tp_ano_con_7,tp_ano_con_8,tp_ano_con_9,tp_ano_con_10,tp_esc_1,tp_esc_2,tp_esc_3,tp_pres_cn0,tp_pres_cn1,tp_pres_ch0,tp_pres_ch1,tp_pres_lc0,tp_pres_lc1,tp_pres_lc2,tp_ling_0,tp_ling_1,q001_A,q001_B,q001_C,q001_D,q001_E,q001_F,q001_G,q001_H,q002_A,q002_B,q002_C,q002_D,q002_E,q002_F,q002_G,q002_H,q006_A,q006_B,q006_C,q006_D,q006_E,q006_F,q006_G,q006_H,q006_I,q006_J,q006_K,q006_L,q006_M,q006_N,q006_O,q006_P,q006_Q,q024_A,q024_B,q024_C,q024_D,q024_E,q026_A,q026_B,q026_C,q047_A,q047_B,q047_C,q047_D,q047_E
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0


In [410]:
X_train.head(5)

Unnamed: 0,co_uf_11,co_uf_12,co_uf_13,co_uf_14,co_uf_15,co_uf_16,co_uf_17,co_uf_21,co_uf_22,co_uf_23,co_uf_24,co_uf_25,co_uf_26,co_uf_27,co_uf_28,co_uf_29,co_uf_31,co_uf_32,co_uf_33,co_uf_35,co_uf_41,co_uf_42,co_uf_43,co_uf_50,co_uf_51,co_uf_52,co_uf_53,sexo_F,sexo_M,raca_0,raca_1,raca_2,raca_3,raca_4,raca_5,tp_st_con_1,tp_st_con_2,tp_st_con_3,tp_st_con_4,tp_ano_con_0,tp_ano_con_1,tp_ano_con_2,tp_ano_con_3,tp_ano_con_4,tp_ano_con_5,tp_ano_con_6,tp_ano_con_7,tp_ano_con_8,tp_ano_con_9,tp_ano_con_10,tp_esc_1,tp_esc_2,tp_esc_3,tp_esc_4,tp_pres_cn0,tp_pres_cn1,tp_pres_cn2,tp_pres_ch0,tp_pres_ch1,tp_pres_ch2,tp_pres_lc0,tp_pres_lc1,tp_pres_lc2,tp_ling_0,tp_ling_1,q001_A,q001_B,q001_C,q001_D,q001_E,q001_F,q001_G,q001_H,q002_A,q002_B,q002_C,q002_D,q002_E,q002_F,q002_G,q002_H,q006_A,q006_B,q006_C,q006_D,q006_E,q006_F,q006_G,q006_H,q006_I,q006_J,q006_K,q006_L,q006_M,q006_N,q006_O,q006_P,q006_Q,q024_A,q024_B,q024_C,q024_D,q024_E,q026_A,q026_B,q026_C,q047_A,q047_B,q047_C,q047_D,q047_E
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0
4,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0


In [463]:
df_result_insc = pd.DataFrame(df_test_clean['NU_INSCRICAO'])

In [464]:
resultado = pd.concat([df_result_insc, pd.DataFrame(np.round(y_pred,3))], axis=1)
resultado.reset_index(inplace=True, drop=True)

In [465]:
resultado.columns=['NU_INSCRICAO', 'NU_NOTA_MT']

In [420]:
resultado.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4576 entries, 0 to 4575
Data columns (total 2 columns):
NU_INSCRICAO    4576 non-null object
NU_NOTA_MT      4576 non-null float64
dtypes: float64(1), object(1)
memory usage: 71.6+ KB


In [466]:
resultado.to_csv("answer.csv", index=False)