In [2]:
import skfuzzy as fuzz
import pandas as pd
import numpy as np
import re
# Load the dataset; every later cell reads from this frame.
df = pd.read_csv('fuzzy_dataset.csv')

In [3]:
def transform_value(value):
    """Strip trailing commas from a label and return it as an int when possible.

    Parameters
    ----------
    value : any
        Label to normalise; it is converted to text with ``str`` first.

    Returns
    -------
    int or str
        The stripped text converted with ``int`` when that succeeds,
        otherwise the stripped text itself.
    """
    text = str(value).rstrip(',')
    try:
        return int(text)
    except ValueError:
        # Not an integer label (e.g. 'level_0'): keep it as text. Only
        # ValueError is caught so interrupts are not swallowed.
        return text
    
# Per-semester summary table.
# For each value of 'integralizacao' the min, mean and max of every other
# column are computed. After transpose() + reset_index() the former index
# levels become the 'level_0' and 'level_1' columns ('level_0' is used below
# as the original column name). Every cell is then converted to text with a
# trailing ',' so that summing within each 'level_0' group concatenates the
# strings of that group into one comma-separated string per cell.
# transform_value() is applied to the column labels so that integer-like
# labels are ints.
medidas = df\
.groupby(by=['integralizacao'])\
.agg([np.min, np.mean, np.max])\
.transpose()\
.reset_index()\
.applymap(str)\
.apply(lambda x: x + ',')\
.groupby('level_0')\
.apply(lambda x: x.sum())\
.rename(columns=lambda x: transform_value(x))

# The 'level_0' cells went through the same ',' suffix + concatenation as the
# data, so only the text before the first ',' is kept as the name.
medidas['level_0'] = medidas['level_0'].apply(lambda x: x.split(',')[0] )
# Rows describing these two columns are excluded from both tables below.
drop = ['id', 'Unnamed: 0']
medidas = medidas[~medidas['level_0'].isin(drop)]


# Per-semester raw values.
# Every cell (including 'integralizacao' itself) is converted to text with a
# trailing ',', so the sum inside each 'integralizacao' group concatenates all
# values of a column into one comma-separated string. The group keys carry the
# trailing ',' as well; after the transpose they are the column labels, and
# transform_value() strips the ',' and converts them to ints where possible.
valores = df\
.applymap(str)\
.apply(lambda x: x + ',')\
.groupby(by=['integralizacao'])\
.agg([np.sum])\
.transpose()\
.reset_index()\
.rename(columns=lambda x: transform_value(x))

valores = valores[~valores['level_0'].isin(drop)]

def gen_dict(table):
    """Turn a table of comma-separated value strings into nested lists of numbers.

    Parameters
    ----------
    table : pandas.DataFrame
        Must contain a 'level_0' column. The first two columns are skipped and
        every remaining cell must be a comma-separated string. The frame is
        not modified.

    Returns
    -------
    dict
        ``{column label: {level_0 value: numbers}}`` where ``numbers`` is the
        sorted list of distinct values of the cell, each rounded to two
        decimals. Tokens that are not plain decimal numbers (for example
        '' or 'nan') are ignored.
    """
    # Plain signed decimal with at least one digit. A raw string avoids the
    # invalid '\.' escape, and requiring a digit rejects a bare '+' or '-',
    # which float() cannot parse.
    numeric_token = re.compile(r"^[+-]?(?:[0-9]+(?:\.[0-9]+)?|\.[0-9]+)$")

    # Index by 'level_0' on a new frame (keeping the column so the positional
    # slice below is unchanged) instead of overwriting the caller's index.
    indexed = table.set_index('level_0', drop=False)
    table_dict = indexed.iloc[:, 2:].to_dict()

    for inner_dict in table_dict.values():
        for key2, joined in inner_dict.items():
            # The set removes duplicates that appear after rounding.
            numbers = {
                round(float(token), 2)
                for token in joined.split(',')
                if numeric_token.match(token)
            }
            inner_dict[key2] = sorted(numbers)

    return table_dict

In [4]:
import seaborn as sns

In [5]:
def flatten_dict(df):
    """Expand the nested dict built by gen_dict into a long-format DataFrame.

    One row is produced per number, with the columns 'integralizacao' (outer
    key), 'campo' (inner key) and 'valor' (the number itself). Outer and inner
    keys are visited in sorted order.
    """
    nested = gen_dict(df)
    records = [
        {"integralizacao": outer, "campo": inner, "valor": number}
        for outer in sorted(nested)
        for inner in sorted(nested[outer])
        for number in nested[outer][inner]
    ]
    return pd.DataFrame(records)

# Long-format view of the per-semester values.
dataplot = flatten_dict(valores)
# Keep only the 'media' fields and average them per semester and field.
dataplot = (
    dataplot[dataplot['campo'].str.contains('media')]
    .groupby(by=['integralizacao', 'campo'])
    .agg([np.mean])
    .reset_index()
)
# agg() with a list yields two-level column labels; keep the first level only.
dataplot.columns = dataplot.columns.get_level_values(0)
# Binarise the averaged value: 1 when it is at least 6.0, otherwise 0.
dataplot['valor'] = dataplot['valor'].apply(lambda media: int(media >= 6.0))
dataplot

Unnamed: 0,integralizacao,campo,valor
0,0,media_NC,0
1,0,media_NE,0
2,0,media_matematica,0
3,0,media_programacao,0
4,1,media_NC,0
5,1,media_NE,0
6,1,media_matematica,0
7,1,media_programacao,0
8,2,media_NC,1
9,2,media_NE,0


In [6]:
import pandas as pd  
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics

def regressao(df, idx):
    """Rank ordered column pairs of one semester by linear-regression error.

    For every ordered pair of distinct columns (skipping the first three
    columns of ``df``), a ``LinearRegression`` is fitted on an 80/20 split
    (``random_state=0``) of the rows where both columns are present, and the
    error on the held-out part is recorded.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing an 'integralizacao' column.
    idx :
        Value of 'integralizacao' selecting the rows to use.

    Returns
    -------
    pandas.DataFrame
        At most five rows with the columns 'mse' (error score, ascending) and
        'a' (``"predictor,target"``). Pairs with an error of exactly 0 are
        left out. The frame is empty when no pair could be evaluated.
    """
    rows = []
    features = df[df['integralizacao'] == idx].iloc[:, 3:]
    for item in features.columns:
        for item2 in features.columns:
            if item == item2:
                continue
            pair = features[[item, item2]].dropna()
            # With fewer than two complete rows the split cannot yield both a
            # train and a test sample, so the pair is skipped instead of
            # letting train_test_split raise.
            if len(pair) < 2:
                continue
            X = pair[item].values.reshape(-1, 1)
            y = pair[item2].values.reshape(-1, 1)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=0
            )
            regressor = LinearRegression()
            regressor.fit(X_train, y_train)
            y_pred = regressor.predict(X_test)
            # NOTE(review): this is the mean ABSOLUTE error although the key
            # is 'mse'; the key is kept because callers read result['mse'].
            rows.append({
                'mse': metrics.mean_absolute_error(y_test, y_pred),
                'a': item,
                'b': item2,
            })

    if not rows:
        # No pair was scored; return the same columns instead of failing in
        # sort_values on a frame without an 'mse' column.
        return pd.DataFrame(columns=['mse', 'a'])

    tmp_df = pd.DataFrame(rows).sort_values(by='mse')
    tmp_df['a'] = tmp_df['a'] + ',' + tmp_df['b']
    tmp_df = tmp_df[['mse', 'a']]
    tmp_df = tmp_df[tmp_df['mse'] != 0]
    return tmp_df.head(5)

In [7]:
# Collect the best column pairs of every semester into a single table.
# Semester 0 always comes first, followed by the other semesters found in the
# data in ascending order.
semesters = [0] + sorted(set(df['integralizacao'].values) - {0})
# DataFrame.append was removed in pandas 2.0; the per-semester frames are
# built first and concatenated once (original row labels are preserved).
result = pd.concat(
    [regressao(df, item).assign(integralizacao=item) for item in semesters]
)

In [8]:
# Round the error score to two decimals before displaying the table.
result = result.assign(mse=result['mse'].round(2))
result

Unnamed: 0,mse,a,integralizacao
2,0.48,"media_programacao,media_NC",0
24,0.5,"media_NC,media_programacao",0
61,0.53,"reprovacoes_NC,reprovacoes_matematica",0
47,0.7,"reprovacoes_matematica,quantidade_trancamentos",0
15,0.73,"media_matematica,quantidade_trancamentos",0
61,0.67,"reprovacoes_NC,reprovacoes_matematica",1
63,0.69,"reprovacoes_NC,quantidade_trancamentos",1
31,0.7,"media_NC,quantidade_trancamentos",1
24,0.71,"media_NC,media_programacao",1
2,0.73,"media_programacao,media_NC",1


In [9]:
# Display the dataset as loaded from the CSV.
df

Unnamed: 0.1,Unnamed: 0,id,integralizacao,media_programacao,media_matematica,media_NE,media_NC,reprovacoes_programacao,reprovacoes_matematica,reprovacoes_NE,reprovacoes_NC,quantidade_trancamentos
0,0,1,0,6.550000,7.300000,,6.700000,1.0,0.0,,1.0,1
1,1,1,1,5.400000,5.200000,3.800000,5.500000,6.0,1.0,1.0,7.0,1
2,2,1,2,4.956250,4.200000,4.709091,5.283333,12.0,1.0,9.0,4.0,1
3,3,1,3,4.822222,,4.466667,6.028571,13.0,,10.0,3.0,1
4,4,1,4,7.066667,6.900000,7.066667,6.900000,0.0,0.0,0.0,0.0,1
5,5,2,0,6.800000,4.000000,,6.100000,1.0,1.0,,2.0,0
6,6,2,1,7.600000,7.200000,,7.680000,1.0,0.0,,1.0,0
7,7,2,2,5.600000,6.850000,,6.016667,2.0,0.0,,2.0,0
8,8,2,3,7.200000,,6.000000,7.500000,0.0,,0.0,0.0,0
9,9,2,4,8.000000,,8.950000,7.933333,1.0,,0.0,1.0,0
