# 資料產生

### 載入函式庫 🙄

In [1]:
import pandas as pd
import numpy as np
import random

### 定義 attributes 以及對應 value 的範圍

In [2]:
# Allowed values for every attribute of a generated record.
# A (low, high) tuple is an inclusive numeric range; a list enumerates the
# categorical options. Key order defines the column order of the dataset.
data_range = {
    "Avg sleep time": (0, 24),
    "Avg study time": (0, 24),
    "Avg video game time": (0, 24),
    "BMI": (19, 35),
    "In a relationship": ["Yes", "No"],
    "Family financial status": ["high", "average", "low"],
    "GPA": (0, 4.3),
    "Laptop brand": ["Leveno", "HP", "Dell", "Acer", "Asus", "Apple"],
    "TOEIC grade": (0, 990),
    "IQ": (100, 180),
    "Grade": ["80+", "60~80", "60-"],
}

### Attributes

In [3]:
# Column names of the dataset, in the insertion order of data_range.
attributes = list(data_range)

In [4]:
# Notebook display of the attribute name list.
attributes

['Avg sleep time',
 'Avg study time',
 'Avg video game time',
 'BMI',
 'In a relationship',
 'Family financial status',
 'GPA',
 'Laptop brand',
 'TOEIC grade',
 'IQ',
 'Grade']

### 定義資料筆數

In [5]:
# Total number of records (rows) to generate.
data_points_numbers = 1_000

### 定義機率

In [6]:
# Probability tables passed as `p=` to np.random.choice in generate_data().
# Each table has one entry per candidate value (in ascending order) and its
# entries sum to 1.

# IQ 100..180 (81 values): the first 50 share 0.969, the last 31 get 0.001 each.
IQ_probs = [*([0.969 / 50] * 50), *([0.001] * 31)]

# GPA 0.0..4.3 in 0.1 steps (44 values): the first 18 are nearly impossible,
# the remaining 26 share 0.99982 evenly.
GPA_probs = [*([0.00001] * 18), *([0.99982 / 26] * 26)]

# Hours 0..24 (25 values each). Zero entries exclude those hour counts.
sleep_probs = [
    *([0] * 3),
    0.02, 0.05, 0.12, 0.21, 0.24, 0.18, 0.09, 0.05, 0.02, 0.02,
    *([0] * 12),
]
study_probs = [
    0.001, 0.02, 0.1, 0.25, 0.2, 0.15, 0.1, 0.08, 0.05,
    *([0.049 / 6] * 6),
    *([0] * 10),
]
game_probs = [
    0.2, 0.12, 0.18, 0.1,
    *([0.4 / 10] * 10),
    *([0] * 11),
]

### 函式：根據機率產生資料

In [7]:
def generate_data():
    """Draw one synthetic student record with its class label left unset.

    Every attribute is sampled independently:

    * sleep / study / video-game hours and IQ are integers drawn from the
      inclusive range in ``data_range`` using the matching ``*_probs`` table;
    * GPA is drawn from a 0.1-step grid over its range using ``GPA_probs``;
    * TOEIC is drawn from a normal distribution (mean 500, std 250),
      truncated to an int, snapped down to a multiple of 5 and clamped to
      its range in ``data_range``;
    * BMI is a uniform integer over its range, and the categorical
      attributes are uniform over their listed options.

    Returns:
        dict: One entry per key of ``data_range``. ``"Grade"`` is ``None``;
        ``rules()`` fills it in.

    Raises:
        ValueError: Raised by ``np.random.choice`` when a probability table
        does not match the number of candidate values or does not sum to 1.
    """

    def pick_int(name, probs):
        # Weighted draw from the inclusive integer range of attribute `name`.
        low, high = data_range[name]
        return np.random.choice(np.arange(low, high + 1), p=probs)

    # Keep this draw order: changing it changes which values a given RNG
    # seed produces.
    sleep = pick_int("Avg sleep time", sleep_probs)
    study = pick_int("Avg study time", study_probs)
    game = pick_int("Avg video game time", game_probs)

    bmi_low, bmi_high = data_range["BMI"]
    BMI = random.randint(bmi_low, bmi_high)
    relation = random.choice(data_range["In a relationship"])
    financial = random.choice(data_range["Family financial status"])

    # Build the 0.1-step GPA grid with linspace + round rather than a
    # float-step arange: arange's length is unreliable for non-integer steps
    # and its values carry rounding noise (e.g. 0.30000000000000004).
    gpa_low, gpa_high = data_range["GPA"]
    gpa_points = int(round((gpa_high - gpa_low) / 0.1)) + 1
    gpa_grid = np.round(np.linspace(gpa_low, gpa_high, gpa_points), 1)
    GPA = np.random.choice(gpa_grid, p=GPA_probs)

    laptop = random.choice(data_range["Laptop brand"])

    # Scalar draw (no size argument): calling int() on a 1-element array is
    # deprecated in recent NumPy releases.
    toeic_low, toeic_high = data_range["TOEIC grade"]
    TOEIC = int(np.random.normal(500, 250))
    TOEIC -= TOEIC % 5  # snap down to a multiple of 5
    TOEIC = min(max(TOEIC, toeic_low), toeic_high)

    IQ = pick_int("IQ", IQ_probs)

    return {
        "Avg sleep time": sleep,
        "Avg study time": study,
        "Avg video game time": game,
        "BMI": BMI,
        "In a relationship": relation,
        "Family financial status": financial,
        "GPA": GPA,
        "Laptop brand": laptop,
        "TOEIC grade": TOEIC,
        "IQ": IQ,
        "Grade": None,  # assigned later by rules()
    }

### 函式：根據規則將資料歸類

In [8]:
def rules():
    """Generate one record and assign its ``"Grade"`` label.

    The rules are checked in order and the first match decides the label
    (labels come from ``data_range["Grade"]``, index 0 / 1 / 2):

    1. IQ >= 150 or GPA >= 3.7                                   -> index 0
    2. "high" financial status and an "Apple" laptop             -> index 0
    3. sleep <= 5, study >= 8, financial status "low"/"average"  -> index 0
    4. study >= 5 and not in a relationship                      -> index 1
    5. TOEIC >= 800 and sleep <= 6                               -> index 1
    6. video game time >= 5                                      -> index 2
    7. otherwise a label picked uniformly at random

    Returns:
        dict: The record produced by ``generate_data()`` with ``"Grade"`` set.
    """
    d = generate_data()
    grades = data_range["Grade"]
    top, middle, bottom = grades[0], grades[1], grades[2]

    if d["IQ"] >= 150 or d["GPA"] >= 3.7:
        d["Grade"] = top
    elif d["Family financial status"] == "high" and d["Laptop brand"] == "Apple":
        d["Grade"] = top
    elif (
        d["Avg sleep time"] <= 5
        and d["Avg study time"] >= 8
        # Fixed typo: this used to test "avergae", which is not one of the
        # financial-status options, so "average" families never matched.
        and d["Family financial status"] in ("low", "average")
    ):
        d["Grade"] = top
    elif d["Avg study time"] >= 5 and d["In a relationship"] == "No":
        d["Grade"] = middle
    elif d["TOEIC grade"] >= 800 and d["Avg sleep time"] <= 6:
        d["Grade"] = middle
    elif d["Avg video game time"] >= 5:
        d["Grade"] = bottom
    else:
        # No rule applies: fall back to a uniformly random label.
        d["Grade"] = random.choice(grades)
    return d

### 產生資料

In [9]:
# Start from an empty DataFrame with one column per attribute.
df = pd.DataFrame(columns=attributes)

In [10]:
# Per-class quotas of rows still to be generated. The first two classes get
# the truncated equal share; the last class absorbs the remainder so the
# quotas add up to data_points_numbers.
grade_labels = data_range["Grade"]
equal_share = data_points_numbers * (1 / len(grade_labels))
class_count = {
    grade_labels[0]: int(equal_share),
    grade_labels[1]: int(equal_share),
}
class_count[grade_labels[2]] = int(
    data_points_numbers - class_count[grade_labels[0]] - class_count[grade_labels[1]]
)

In [11]:
# Fail fast instead of spinning forever: the loop below consumes the quotas
# in class_count, so it can only finish while enough quota is left for every
# row (for example, it would hang if this cell were re-run without rebuilding
# class_count first).
if sum(class_count.values()) < data_points_numbers:
    raise RuntimeError(
        "class_count quotas are smaller than data_points_numbers; "
        "rebuild class_count before generating data"
    )

# Collect the records in a list and build the DataFrame once.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
rows = []
for _ in range(data_points_numbers):
    d = rules()
    # Rejection sampling: redraw until the record's class still has quota.
    while class_count[d["Grade"]] == 0:
        d = rules()
    class_count[d["Grade"]] -= 1
    rows.append(d)

df = pd.DataFrame(rows, columns=attributes)

### 確認 attributes 的 datatype

In [12]:
# Column groups by target dtype.
numeric_attributes = ["Avg sleep time", "Avg study time", "Avg video game time", "TOEIC grade", "IQ"]
floating_attributes = ["BMI", "GPA"]
category_attributes = ["In a relationship", "Family financial status", "Laptop brand", "Grade"]

# Convert each group in place: numeric columns via pd.to_numeric, then the
# float64 columns, then the categorical columns.
numeric_columns = df[numeric_attributes]
df[numeric_attributes] = numeric_columns.apply(pd.to_numeric)

floating_columns = df[floating_attributes]
df[floating_attributes] = floating_columns.astype('float64')

category_columns = df[category_attributes]
df[category_attributes] = category_columns.astype('category')

In [13]:
# Notebook display of the per-column dtypes.
df.dtypes

Avg sleep time                int64
Avg study time                int64
Avg video game time           int64
BMI                         float64
In a relationship          category
Family financial status    category
GPA                         float64
Laptop brand               category
TOEIC grade                   int64
IQ                            int64
Grade                      category
dtype: object

In [14]:
# Notebook display of the generated DataFrame.
df

Unnamed: 0,Avg sleep time,Avg study time,Avg video game time,BMI,In a relationship,Family financial status,GPA,Laptop brand,TOEIC grade,IQ,Grade
0,7,3,11,32.0,Yes,average,3.8,Leveno,220,109,80+
1,5,2,2,29.0,Yes,average,2.0,HP,745,138,80+
2,7,3,2,30.0,No,average,2.7,Apple,330,109,60~80
3,7,3,1,26.0,No,low,4.0,Dell,575,103,80+
4,5,7,11,24.0,Yes,low,3.8,Apple,365,147,80+
5,7,5,6,24.0,Yes,average,2.4,Dell,660,136,60-
6,6,3,13,31.0,No,average,4.0,Asus,180,139,80+
7,4,12,1,35.0,No,high,4.0,Apple,685,141,80+
8,6,2,1,25.0,Yes,average,1.9,Dell,630,104,60~80
9,7,3,1,29.0,Yes,high,3.8,Acer,400,136,80+


In [15]:
# Notebook display of the number of rows per "Grade" value.
df["Grade"].value_counts()

60-      334
80+      333
60~80    333
Name: Grade, dtype: int64

In [16]:
# !jupyter nbconvert --to script data_generator.ipynb