# Mô tả dữ liệu
- Dataset: [Dresses_Attribute_Sales](https://archive.ics.uci.edu/ml/datasets/Dresses_Attribute_Sales)
- Associated Tasks: Classification, Clustering

In [1]:
# import all necessary library
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
import matplotlib.pyplot as plt

In [7]:
# initialize Spark
conf = (SparkConf()
         .setMaster("local[*]")
         .setAppName("Data description"))
spark = SparkContext(conf = conf)

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=PySparkShell, master=local[*]) created by <module> at /Users/hongong/virtualenv/trustingsocial/lib/python2.7/site-packages/IPython/utils/py3compat.py:289 

In [None]:
# reading dataset to dataframe
schema = StructType([
    StructField("Dress_ID", StringType(), True),
    StructField("Style", StringType(), True),
    StructField("Price", StringType(), True),
    StructField("Rating", FloatType(), True),
    StructField("Size", StringType(), True),
    StructField("Season", StringType(), True),
    StructField("NeckLine", StringType(), True),
    StructField("SleeveLength", StringType(), True),
    StructField("waiseline", StringType(), True),
    StructField("Material", StringType(), True),
    StructField("FabricType", StringType(), True),
    StructField("Decoration", StringType(), True),
    StructField("Pattern Type", StringType(), True),
    StructField("Recommendation", IntegerType(), True)])

df = spark.read \
        .schema(schema) \
        .format("com.databricks.spark.csv") \
        .option("header", "true") \
        .load("Dresses_Attribute_Sales.csv")

df.show(5)

# Các loại dữ liệu
- Nomial
- Numbers
- Odinal
- Ratio

In [None]:
# Nomial
df_nomial = df[["Style", "NeckLine", "Material", "Pattern Type"]]
df_nomial.show(5)

In [None]:
# Numbers
df_numbers = df[["Dress_ID"]]
df_numbers.show(5)

In [None]:
# Ordinal
df_ordinal = df[["Size"]]
df_ordinal.show(5)

In [None]:
# ratio
df_ratio = df[["Rating"]]
df_ratio.show(5)

# Trung tâm dữ liệu
- Trung bình (mean)
$$\mu = \bar{x} = \frac{1}{n} \sum_{i=1}^n x_i = \frac{1}{n} (x_1 + ... + x_n)$$
- Trung vị (median)
- Mode

In [None]:
df_nomial.describe().show()

In [None]:
df_numbers.describe().show()

In [None]:
df_ordinal.describe().show()

In [None]:
df_ratio.describe().show()

In [None]:
mean_rating = df_ratio.agg(F.mean(df_ratio.Rating)).first()[0]
print "Mean rating:", mean_rating

In [None]:
sqlContext.registerDataFrameAsTable(df_ratio, "df_ratio")

median_rating = sqlContext.sql("""
    SELECT percentile(Rating, 0.5) AS median_rating 
    FROM df_ratio
""").first()["median_rating"]

print "Median rating:", median_rating

In [None]:
counts = df_ratio.groupBy("Rating").count()

mode_rating = counts.join(
        counts.agg(F.max('count').alias('count')),
        on='count'
    ).limit(1).select("Rating").first()["Rating"]

print "Mode rating:", mode_rating

In [None]:
# visualize price column
fig, ax = plt.subplots(figsize=(15, 5))
ax.axvline(mean_rating, color='red', linewidth=5)
ax.axvline(median_rating, color='green', linewidth=5)
ax.axvline(mode_rating, color='blue', linewidth=5)

# Add arrows annotating the means:
def add_arrow(label, val, align="left"):
    ax.annotate(label + ': {:0.2f}'.format(val), xy=(val, 1), xytext=(15, 15),
            xycoords=('data', 'axes fraction'), textcoords='offset points',
            horizontalalignment=align, verticalalignment='center',
            arrowprops=dict(arrowstyle='-|>', fc='black', shrinkA=0, shrinkB=0,
                            connectionstyle='angle,angleA=0,angleB=90,rad=10'),
            )

add_arrow("Mean", mean_rating)
add_arrow("Median", median_rating)
add_arrow("Mode", mode_rating)
ax.legend(loc='upper left')
ax.margins(0.05)

bins, hist = df_ratio.select("Rating").rdd.flatMap(lambda x: x).histogram(70)
hist = np.asarray(hist)
bins = np.asarray(bins)
width = 0.7 * (bins[1] - bins[0])
center = (bins[:-1] + bins[1:]) / 2
plt.bar(center, hist, align='center', width=width)
plt.title("Rating Histogram")
plt.xlabel("Rating")
plt.ylabel("Frequency")
plt.show()

# Biến đổi của dữ liệu 
- Khoảng đoạn (range)
$$range = maxValue - minValue$$

- Phương sai (variance)
$$\sigma^2 = \frac{\sum_{i=1}^n (x_i - \mu)}{n}$$

- Độ lệch chuẩn (standard deviation)
$$\sigma = \sqrt{\frac{\sum_{i=1}^n (x_i - \mu)}{n}}$$

- Z-score: biến đổi từ sample mean để thực hiện Z-test
    * Shift trung bình mẫu về 0 bằng $X - \mu$
    * Nén độ lệch chuẩn của mẫu ban đầu lại bằng cách chia cho $\sigma$
$$Z = \frac{X - \mu}{\sigma}$$

- Phân vị (percentile)
$$percentile \ of \ x = \frac{No. value \ below \ x}{n} * 100\\$$
$$quartiles = \frac{percentile * n}{100}$$

In [None]:
# range of rating
min_rating = df_ratio.agg(F.min(df_ratio.Rating)).first()[0]
max_rating = df_ratio.agg(F.max(df_ratio.Rating)).first()[0]
range_rating = max_rating - min_rating
print "Min rating:", min_rating
print "Max rating:", max_rating
print "Rating range:", range_rating

In [None]:
# variance of rating
var_rating = df_ratio.agg(F.variance(df_ratio.Rating)).first()[0]
print "Rating variance:", var_rating

In [None]:
# standard deviation of rating
std_rating = df_ratio.agg(F.stddev(df_ratio.Rating)).first()[0]
print "Rating standard deviation:", std_rating

In [None]:
# z-score of rating
df_z_score_rating = sqlContext.sql("SELECT (Rating - " + \
                                   str(mean_rating) + " / " + str(std_rating) + \
                                   ") as Rating FROM df_ratio")               

print "\nZ-score of rating:"
df_z_score_rating.show(5)

bins, hist = df_z_score_rating.select("Rating").rdd.flatMap(lambda x: x).histogram(70)
hist = np.asarray(hist)
bins = np.asarray(bins)
width = 0.7 * (bins[1] - bins[0])
center = (bins[:-1] + bins[1:]) / 2

# plotting
fig = plt.figure(figsize=(15, 5), dpi= 80, facecolor='w', edgecolor='k')
plt.bar(center, hist, align='center', width=width)
plt.title("Z score distribution")
plt.xlabel("Rating")
plt.ylabel("Frequency")
plt.show()

# Xác suất
- Xác suất (probability)
$$probability = \frac{event(s)}{outcome(s)}$$

- Phép đếm hoán vị (permutation)
$$P(n, r) = \frac{n!}{(n - r)!}\\$$
$$n: distinct\ object\ to\ choose\ from$$
$$r: spaces\ to\ fill.$$

- Phép đếm tổ hợp (combination)
$$C(n, r) = \frac{n!}{r!(n - r)!}$$

- Xác suất có điều kiện (conditional probability)
$$P(B|A) = \frac{P(A \cap B)}{P(A)}$$

- Biến độc lập và biến phụ thuộc (independent/dependent variable)
    - Independent variable: 
        * Dress_ID
        * Style
        * Price
        * Rating
        * Size
        * Season
        * NeckLine
        * SleeveLength
        * waiseline
        * Material
        * FabricType
        * Decoration
        * Pattern Type
    - Dependent variable: Recommendation

- Bayes
$$P(A|B) = \frac{P(B|A) P(A)}{P(B)}\\$$
$$Posterior = \frac{Likelihood * Prior}{Evidence}$$

In [None]:
# probability to get rating 4.0
# counter=Counter(df_ratio["Rating"])
# prob_of_4 = counter[4.0] * 100.0 / len(counter)
# print "Probability to get rating 4.0:", prob_of_4, "%"

bins, hist = df_z_score_rating.select("Rating").rdd.flatMap(lambda x: x).histogram(70)
hist