# Цели и задачи проекта

# Первичный анализ данных
Импортируем библиотеки и прочитаем файл с данными:

In [1]:
import pandas as pd
import numpy as np
from math import nan
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations
from scipy.stats import ttest_ind

# Raise the pandas display limits to 50 rows / 50 columns
pd.options.display.max_rows = 50
pd.options.display.max_columns = 50

# Read the dataset and lower-case every column name in one step
df = pd.read_csv('stud_math.csv').rename(columns=str.lower)

Рассмотрим, какие данные хранятся в файле и какие типы данных у столбцов:

In [2]:
# Show the first 10 rows, then print the per-column summary produced by df.info()
display(df.head(10))
df.info() 

Unnamed: 0,school,sex,age,address,famsize,pstatus,medu,fedu,mjob,fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,"studytime, granular",higher,internet,romantic,famrel,freetime,goout,health,absences,score
0,GP,F,18,U,,A,4.0,4.0,at_home,teacher,course,mother,2.0,2.0,0.0,yes,no,no,no,yes,-6.0,yes,,no,4.0,3.0,4.0,3.0,6.0,30.0
1,GP,F,17,U,GT3,,1.0,1.0,at_home,other,course,father,1.0,2.0,0.0,no,yes,no,no,no,-6.0,yes,yes,no,5.0,3.0,3.0,3.0,4.0,30.0
2,GP,F,15,U,LE3,T,1.0,1.0,at_home,other,other,mother,1.0,2.0,3.0,yes,no,,no,yes,-6.0,yes,yes,,4.0,3.0,2.0,3.0,10.0,50.0
3,GP,F,15,U,GT3,T,4.0,2.0,health,,home,mother,1.0,3.0,0.0,no,yes,yes,yes,yes,-9.0,yes,yes,yes,3.0,2.0,2.0,5.0,2.0,75.0
4,GP,F,16,U,GT3,T,3.0,3.0,other,other,home,father,1.0,2.0,0.0,no,yes,yes,no,yes,-6.0,yes,no,no,4.0,3.0,2.0,5.0,4.0,50.0
5,GP,M,16,U,LE3,T,4.0,3.0,services,other,reputation,mother,1.0,2.0,0.0,no,yes,yes,yes,yes,-6.0,yes,yes,no,5.0,4.0,2.0,5.0,10.0,75.0
6,GP,M,16,,LE3,T,2.0,2.0,other,other,home,mother,1.0,2.0,0.0,no,no,no,no,yes,-6.0,yes,yes,no,4.0,4.0,4.0,3.0,0.0,55.0
7,GP,F,17,U,GT3,A,4.0,4.0,other,teacher,home,mother,2.0,2.0,0.0,yes,yes,no,no,yes,-6.0,yes,no,no,4.0,1.0,4.0,1.0,6.0,30.0
8,GP,M,15,U,LE3,A,3.0,2.0,services,other,home,mother,1.0,2.0,0.0,no,yes,yes,no,yes,-6.0,yes,yes,no,,2.0,2.0,1.0,0.0,95.0
9,GP,M,15,U,,,3.0,4.0,other,other,home,mother,1.0,2.0,0.0,no,yes,yes,yes,yes,-6.0,yes,yes,no,5.0,5.0,1.0,5.0,0.0,75.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 30 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   school               395 non-null    object 
 1   sex                  395 non-null    object 
 2   age                  395 non-null    int64  
 3   address              378 non-null    object 
 4   famsize              368 non-null    object 
 5   pstatus              350 non-null    object 
 6   medu                 392 non-null    float64
 7   fedu                 371 non-null    float64
 8   mjob                 376 non-null    object 
 9   fjob                 359 non-null    object 
 10  reason               378 non-null    object 
 11  guardian             364 non-null    object 
 12  traveltime           367 non-null    float64
 13  studytime            388 non-null    float64
 14  failures             373 non-null    float64
 15  schoolsup            386 non-null    obj

Итого в датасете 395 строк и 30 столбцов: 13 числовых и 17 строковых. В большинстве столбцов есть пропуски.

# Функции для работы со столбцами

In [10]:
def get_absence_number(col):
    """Count the missing entries of a column.

    Args:
        col: pandas Series to inspect.

    Returns:
        Number of elements that pandas flags as missing.
    """
    # isnull() is an alias of isna(), so a single mask covers both checks
    missing_mask = col.isna()
    return missing_mask.sum()


def check_discrete_values(col, list_values):
    """Print whether a column contains values outside an allowed set.

    Missing entries are not treated as errors. When unexpected values are
    found, their frequencies are printed. Nothing is returned.

    Args:
        col: pandas Series to validate.
        list_values: collection of allowed values.
    """
    is_present = col.notna()
    is_allowed = col.isin(list_values)
    unexpected = col[~is_allowed & is_present]
    if unexpected.size != 0:
        print('Данные не корректные:', unexpected.value_counts())
        return
    print('Данные корректные.')


def check_continuous_values(col, list_borders):
    """Print whether a numeric column has values outside a closed range.

    Missing entries are not treated as errors. When out-of-range values are
    found, their frequencies are printed. Nothing is returned.

    Args:
        col: pandas Series of numeric values to validate.
        list_borders: the two range borders, in any order. Any iterable is
            accepted and the argument itself is never modified.

    Raises:
        IndexError: if fewer than two borders are supplied.
    """
    # sorted() builds a new list, so the caller's borders keep their order
    # (the previous in-place .sort() silently reordered the caller's list
    # and did not work for tuples).
    borders = sorted(list_borders)
    lower, upper = borders[0], borders[1]

    # Explicit grouping: `&` binds tighter than `|`, so without parentheses
    # the not-null filter would apply to the upper-bound test only.
    out_of_range = (col < lower) | (col > upper)
    col_err = col[out_of_range & col.notna()]

    if col_err.size == 0:
        print('Данные корректные.')
    else:
        print('Данные не корректные:', col_err.value_counts())


# col.replace(40, 4, inplace=True)


def remove_value(df, col_name, value):
    """Return the rows of a frame whose column entry differs from a value.

    When ``value`` is itself missing (NaN/None), the rows in which the
    column is missing are dropped instead. The input frame is not modified.

    Args:
        df: pandas DataFrame to filter.
        col_name: name of the column to test.
        value: value to remove, or a missing value to drop missing rows.

    Returns:
        A filtered DataFrame.
    """
    target = df[col_name]
    if pd.isna(value):
        # An equality test cannot match NaN, so filter on the null mask
        keep = target.notna()
    else:
        keep = target != value
    return df[keep]


def get_percentiles(col):
    """Compute the quartiles, the 1.5*IQR outlier bounds and the IQR.

    Args:
        col: pandas Series of numeric values.

    Returns:
        A three-element list ``[[q1, q3], [low, high], iqr]`` where
        ``low = q1 - 1.5*iqr`` and ``high = q3 + 1.5*iqr``.
    """
    q1 = col.quantile(0.25)
    q3 = col.quantile(0.75)
    iqr = q3 - q1
    margin = 1.5*iqr
    return [[q1, q3], [q1 - margin, q3 + margin], iqr]


def print_column_info(col):
    """Display a short summary of a column.

    Shows the value frequencies and their total, then prints the number of
    missing entries, the number of unique values and how many values occur
    more than 10 times. Nothing is returned.

    Args:
        col: pandas Series to summarise.
    """
    # Compute the frequency table once and reuse it for every statistic
    counts = col.value_counts()
    display(counts)
    display(sum(counts.values))

    n_missing = get_absence_number(col)
    n_unique = col.nunique()
    n_frequent = (counts > 10).sum()
    print(f'Количество пропусков: {n_missing}')
    print(f'Количество уникальных значений: {n_unique}')
    print(f'Количество значений, встретившихся более 10 раз: {n_frequent}')

    




In [11]:
# Summarise the 'pstatus' column: frequencies, missing count, unique values
print_column_info(df['pstatus'])

T    314
A     36
Name: pstatus, dtype: int64

350

Количество пропусков: 45
Количество уникальных значений: 2
Количество значений, встретившихся более 10 раз: 2
