# Разбиение на тестовую и обучающую выборку в Python

Задача: разбить набор данных на обучающую и тестовую выборки.

In [20]:
# Load relevant libraries.

%pylab inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.graphics.api import abline_plot
import patsy
import seaborn as sns
sns.set(context='notebook', style='whitegrid', palette='deep', font='sans-serif', font_scale=1, rc=None)
import sklearn as skl

Populating the interactive namespace from numpy and matplotlib


In [21]:
# Spam database (CSV file hosted on GitHub).
target_url = "https://raw.githubusercontent.com/samujjwaal/Spam-Email-Classifier/master/spambase.csv"

spam = pd.read_csv(target_url)

# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called directly: wrapping it in print() would append a stray "None".
spam.info()

# Summary statistics of the label column.
print(spam['class'].describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4601 entries, 0 to 4600
Data columns (total 58 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   word_freq_make              4601 non-null   float64
 1   word_freq_address           4601 non-null   float64
 2   word_freq_all               4601 non-null   float64
 3   word_freq_3d                4601 non-null   float64
 4   word_freq_our               4601 non-null   float64
 5   word_freq_over              4601 non-null   float64
 6   word_freq_remove            4601 non-null   float64
 7   word_freq_internet          4601 non-null   float64
 8   word_freq_order             4601 non-null   float64
 9   word_freq_mail              4601 non-null   float64
 10  word_freq_receive           4601 non-null   float64
 11  word_freq_will              4601 non-null   float64
 12  word_freq_people            4601 non-null   float64
 13  word_freq_report            4601 

In [22]:
# Preview the first rows of the loaded spam DataFrame.
spam.head()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_%3B,char_freq_%28,char_freq_%5B,char_freq_%21,char_freq_%24,char_freq_%23,capital_run_length_average,capital_run_length_longest,capital_run_length_total,class
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


## Вариант решения

### Использовать **pandas**

In [23]:
# Draw a reproducible random sample of 67% of the rows as the training set;
# every row whose index label was not sampled goes to the test set.
spamtrain = spam.sample(frac=0.67, random_state=1066)
spamtest = spam.drop(index=spamtrain.index)

# Sanity check: print the non-null 'class' counts of the train, test and
# full sets, in that order (the first two should add up to the third).
for subset in (spamtrain, spamtest, spam):
    print(subset['class'].count())

3083
1518
4601


## Вариант решения

### Использовать **train_test_split** из **sklearn.model_selection**

In [24]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the dataset from a local CSV file.
df = pd.read_csv("north_korea_missile_test_database.csv")

# "Missile Name" is the target; all remaining columns are the features.
X = df.drop(columns="Missile Name")
y = df["Missile Name"]

# Hold out 20% of the rows as the test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=31,
)

In [25]:
# Carve a validation set out of the current training data: a quarter of the
# training rows goes to validation, the rest stays as the new training set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=31,
)

In [26]:
# Number of rows in the training feature set.
len(X_train)

81

In [27]:
# Number of rows in the validation feature set.
len(X_val)

27

In [28]:
# Number of rows in the test feature set.
len(X_test)

27

In [29]:
# Print the size of every split, features then target for each of
# train / validation / test, to confirm that X and y stay aligned.
for part in (X_train, y_train, X_val, y_val, X_test, y_test):
    print(len(part))

81
81
27
27
27
27


# Задания:
1. Подготовить pandas dataframe на основе "сырых" данных - https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/ (файлы **spambase.data** и **spambase.names**)
2. Провести его анализ на предмет сбалансированности классов.
3. Произвести разбиение на обучающую и тестовую выборки с использованием **sklearn.model_selection.StratifiedShuffleSplit** (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html) в соотношениях **80/20** и **70/30**


# Задания выполнил студент группы ББМО-01-23 Агасиев Максим Артурович

## 1. Подготовка pandas dataframe на основе "сырых" данных

In [30]:
import requests as r
import re

data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data'
names_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.names'

# Download spambase.names. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() stops on an HTTP error instead
# of silently parsing an error page as if it were the names file.
response = r.get(names_url, timeout=30)
response.raise_for_status()
names = response.text

# Extract the dataset's field names: each is a run of non-whitespace
# characters at the start of a line, immediately followed by a colon.
attribute_names = re.findall(r"^(\S+):", names, re.MULTILINE)
if not attribute_names:
    # Fail early rather than build a DataFrame with meaningless columns.
    raise ValueError(f"No attribute names found in {names_url}")

# Add the name for the label column.
# NOTE(review): assumes the label is the last column of spambase.data.
attribute_names.append('class')

In [31]:
# Read the raw data, taking the column names from attribute_names; header=None
# states explicitly that the first row is data, not a header line.
task_df = pd.read_csv(
    data_url,
    header=None,
    names=attribute_names,
)
task_df

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,class
0,0.00,0.64,0.64,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.778,0.000,0.000,3.756,61,278,1
1,0.21,0.28,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,...,0.000,0.132,0.0,0.372,0.180,0.048,5.114,101,1028,1
2,0.06,0.00,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.010,0.143,0.0,0.276,0.184,0.010,9.821,485,2259,1
3,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.137,0.0,0.137,0.000,0.000,3.537,40,191,1
4,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.135,0.0,0.135,0.000,0.000,3.537,40,191,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4596,0.31,0.00,0.62,0.0,0.00,0.31,0.00,0.00,0.00,0.00,...,0.000,0.232,0.0,0.000,0.000,0.000,1.142,3,88,0
4597,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.353,0.000,0.000,1.555,4,14,0
4598,0.30,0.00,0.30,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.102,0.718,0.0,0.000,0.000,0.000,1.404,6,118,0
4599,0.96,0.00,0.00,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.057,0.0,0.000,0.000,0.000,1.147,5,78,0


## 2. Анализ датасета на предмет сбалансированности классов

In [32]:
# Count the rows that fall into each class to see how balanced the data is.
grouped = task_df.groupby('class')
grouped['class'].count()

class
0    2788
1    1813
Name: class, dtype: int64

## 3. Разбиение на обучающую и тестовую выборки в соотношениях 80/20 и 70/30

In [33]:
from sklearn.model_selection import StratifiedShuffleSplit

# Separate the class label from the feature columns.
y = task_df['class']
X = task_df.drop(columns='class')

# 80/20 split. With n_splits=1 the splitter yields exactly one (train, test)
# pair of positional index arrays, so it is taken with next() rather than a loop.
sss_20 = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1715)
train, test = next(sss_20.split(X, y))
X_train_20, y_train_20 = X.iloc[train], y.iloc[train]
X_test_20, y_test_20 = X.iloc[test], y.iloc[test]

print(f"В соотношении 80/20: {len(X_train_20)} обучающих и {len(X_test_20)} тестовых. ")

# 70/30 split, same seed and same procedure.
sss_30 = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=1715)
train, test = next(sss_30.split(X, y))
X_train_30, y_train_30 = X.iloc[train], y.iloc[train]
X_test_30, y_test_30 = X.iloc[test], y.iloc[test]

print(f"В соотношении 70/30: {len(X_train_30)} обучающих и {len(X_test_30)} тестовых. ")

В соотношении 80/20: 3680 обучающих и 921 тестовых. 
В соотношении 70/30: 3220 обучающих и 1381 тестовых. 
