In [164]:
import pandas as pd
import numpy as np
import scipy
import os
import os.path
from matplotlib import cm
from IPython.display import display, HTML

import matplotlib
%matplotlib inline
%config InlineBackend.figure_format = 'retina' 
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
import calendar

df = pd.read_csv("Data/ava.csv")

In [165]:
df = df.drop(["Unnamed: 0"], axis = 1)
df.head()

Unnamed: 0,user,first_date,last_date,cycle_number,cycle_length,date,cycle_day,skin_temperature,hr,br
0,359,2016-11-12,2016-12-06,1,25,2016-11-12,1,36.11,68.375941,16.081958
1,359,2016-11-12,2016-12-06,1,25,2016-11-13,2,35.7,67.499289,15.681856
2,359,2016-11-12,2016-12-06,1,25,2016-11-14,3,35.38,63.63292,15.381588
3,359,2016-11-12,2016-12-06,1,25,2016-11-15,4,35.64,66.366427,15.448914
4,359,2016-11-12,2016-12-06,1,25,2016-11-16,5,35.87,66.68099,15.823407


In [166]:
#Create a new function:
def num_missing(x):
    return sum(x.isnull())

#Applying per column:
print("Missing values per column:")
print(df.apply(num_missing, axis=0)) #axis=0 defines that function is to be applied on each column

#Applying per row:
#print("\nMissing values per row:")
#print(df.apply(num_missing, axis=1).head()) #axis=1 defines that function is to be applied on each row

Missing values per column:
user                 0
first_date           0
last_date            0
cycle_number         0
cycle_length         0
date                 0
cycle_day            0
skin_temperature    16
hr                  16
br                   9
dtype: int64


### work only without NaNs

In [167]:
df = df.dropna(axis=0, how='all')

### normalisation of features

In [168]:
df["st_norm"] = (df["skin_temperature"]-df["skin_temperature"].mean())\
    /df["skin_temperature"].std()

In [169]:
df["hr_norm"] = (df["hr"]-df["hr"].mean())\
                    /df["hr"].std()

In [170]:
df["br_norm"] = (df["br"]-df["br"].mean())\
                    /df["br"].std()

In [171]:
### min and max of one cycle

In [172]:
print("number of users: ", len(df["user"].unique()))
print("number of cycles: ", len(df.groupby(["user","cycle_number"]).describe()))

number of users:  180
number of cycles:  792


data.loc[(data["Gender"]=="Female") & (data["Education"]=="Not Graduate") & (data["Loan_Status"]=="Y"), ["Gender","Education","Loan_Status"]]

In [204]:
print("unique cycle lengths: ", sorted(df["cycle_length"].unique()))
print("number of unique cycle lengths: ", len(df["cycle_length"].unique()))
print()
print("regular cycle lengths between 20 and 36 days")

unique cycle lengths:  [11, 12, 13, 14, 15, 16, 17, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 43, 47, 48, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 73, 92, 98, 107, 112, 123, 124, 167, 228, 245, 300]
number of unique cycle lengths:  59

regular cycle lengths between 20 and 36 days


### Dataframe with cycles of regular length only

In [205]:
df_regLen = df.loc[(df.cycle_length >= 20)&(df.cycle_length <= 36)]
print("unique cycle lengths in regular df: ", sorted(df_regLen["cycle_length"].unique()))
print("number of cycles in regular df: ", len(df_regLen.groupby(["user","cycle_number"]).describe()))

unique cycle lengths in regular df:  [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36]
number of cycles in regular df:  717


### Dataframe with cycles of regular length only and without missing days

(if cycle_length value > count of cycle_days in one cycle)

In [206]:
groupby_cycles = df_regLen.groupby(["user","cycle_number"]).describe()

In [207]:
groupby_cycles = pd.DataFrame(groupby_cycles[["cycle_day", "cycle_length"]]).reset_index()

In [208]:
print("number of incomplete cycles (with missing days): ", len(groupby_cycles[groupby_cycles.cycle_day["count"] != groupby_cycles.cycle_length["mean"]]))

number of incomplete cycles (with missing days):  542


In [209]:
print("number of complete cycles (without missing days): ",len(groupby_cycles[groupby_cycles.cycle_day["count"] == groupby_cycles.cycle_length["mean"]]))
print("number users with complete cycles: ",len(groupby_cycles[groupby_cycles.cycle_day["count"] == groupby_cycles.cycle_length["mean"]]))

number of complete cycles (without missing days):  175
number users with complete cycles:  175


In [210]:
index_completeCycles = groupby_cycles[groupby_cycles.cycle_day["count"] == groupby_cycles.cycle_length["mean"]].index
gold_df = groupby_cycles.iloc[index_completeCycles][["user", "cycle_number"]]

# user and users cycles with complete data and regular cycle length
print(gold_df.user.values)
print(len(gold_df.user.unique()))

[  360   360   406   406   406   406   406   406   407   407   407   407
   407   407   411   411   411   411   411   417   417   417   417   417
   418   418   418   418   423   423   423   423   427   427   427   436
   436   447   466   466   466   487   500   500   500   500   513   513
   513   513   513   827   827   827   827   827   912   912   917   939
   939   950   950  1316  1320  2559  2559  2560  2741  2741  2746  2746
  2760  2760  2760  2767  2768  2787  2787  2787  2787  2787  3775  3775
  3775  3994  3994  4001  4025  4039  4039  4039  4055  4057  4070  4070
  4070  4070  4070  4070  4070  4070  4569  4569  4569  4569  4570  4577
  4587  4587  4872  4874  4874  4881  4881  4881  4919  4925  4925  4927
  4927  4930  4930  4940  4940  4940  4940  4942  4942  4942  4942  5184
  5200  5200  5203  5206  5206  5213  5214  5214  5214  5214  5448  5448
  5448  5533  5539  5539  5547  5547  5556  5559  5559  5559  5560  5792
  7959  8976 13849 14026 14027 14027 14027 14032 14

In [217]:
# time series data frame with complete cycles and regular length
gold = pd.DataFrame()
for user in np.unique(gold_df.user.values):
    for cycle in gold_df[gold_df.user == user].cycle_number.values:
        #print("user: ", user)
        #print("cycle: ", cycle)
        gold = gold.append(df_regLen[(df_regLen["user"] == user)&(df_regLen["cycle_number"] == cycle)])
        #print(gold[["user", "cycle_number", "date"]])

In [212]:
gold2 = gold.reset_index()

In [213]:
#gold2["cycle_index"]

In [218]:
gold.to_csv("Data/data_clean.csv")

In [215]:
print(gold[gold.user == 360])

     user  first_date   last_date  cycle_number  cycle_length        date  \
56    360  2016-09-08  2016-10-11             1            34  2016-09-08   
57    360  2016-09-08  2016-10-11             1            34  2016-09-09   
58    360  2016-09-08  2016-10-11             1            34  2016-09-10   
59    360  2016-09-08  2016-10-11             1            34  2016-09-11   
60    360  2016-09-08  2016-10-11             1            34  2016-09-12   
61    360  2016-09-08  2016-10-11             1            34  2016-09-13   
62    360  2016-09-08  2016-10-11             1            34  2016-09-14   
63    360  2016-09-08  2016-10-11             1            34  2016-09-15   
64    360  2016-09-08  2016-10-11             1            34  2016-09-16   
65    360  2016-09-08  2016-10-11             1            34  2016-09-17   
66    360  2016-09-08  2016-10-11             1            34  2016-09-18   
67    360  2016-09-08  2016-10-11             1            34  2016-09-19   

## Feature engineering

In [138]:
br_max = gold.groupby(["user","cycle_number"]).describe()["br"]["max"]
br_min = gold.groupby(["user","cycle_number"]).describe()["br"]["min"]

In [148]:
features = br_max

features = features.reset_index()
features = pd.DataFrame(features)
features["br_max"] = features["max"]
features = features.drop(["max"], axis = 1)
features["br_min"] = br_min.values
features

Unnamed: 0,user,cycle_number,br_max,br_min
0,360,1,14.211006,13.081668
1,360,3,14.132796,12.881180
2,406,5,15.032636,14.642394
3,406,6,15.306203,14.490755
4,406,8,15.281033,14.390497
5,406,11,15.688719,14.407575
6,406,12,15.500160,14.523382
7,406,13,15.497435,14.457574
8,407,3,14.940587,13.879623
9,407,4,15.085202,13.892877


In [149]:
from sklearn import cluster
hclust = cluster.AgglomerativeClustering(n_clusters=4, affinity="euclidean", linkage="ward")

In [152]:
hclust = hclust.fit(features[["br_min", "br_max"]])
hclust.

#### Example
http://scikit-learn.org/stable/auto_examples/cluster/plot_digits_linkage.html#sphx-glr-auto-examples-cluster-plot-digits-linkage-py

In [158]:
import numpy as np
from scipy import ndimage
from matplotlib import pyplot as plt

from sklearn import manifold, datasets

def plot_clustering(X_red, X, labels, title=None):
    x_min, x_max = np.min(X_red, axis=0), np.max(X_red, axis=0)
    X_red = (X_red - x_min) / (x_max - x_min)

    plt.figure(figsize=(6, 4))
    for i in range(X_red.shape[0]):
        plt.text(X_red[i, 0], X_red[i, 1], str(y[i]),
                 color=plt.cm.spectral(labels[i] / 10.),
                 fontdict={'weight': 'bold', 'size': 9})

    plt.xticks([])
    plt.yticks([])
    if title is not None:
        plt.title(title, size=17)
    plt.axis('off')
    plt.tight_layout()

#----------------------------------------------------------------------
# 2D embedding of the digits dataset
print("Computing embedding")
X_red = manifold.SpectralEmbedding(n_components=2).fit_transform(features)
y = features
print("Done.")

from sklearn.cluster import AgglomerativeClustering

for linkage in ('ward', 'average', 'complete'):
    clustering = AgglomerativeClustering(linkage=linkage, n_clusters=10)
    clustering.fit(X_red)
    plot_clustering(X_red, features, clustering.labels_, "%s linkage" % linkage)


plt.show()

Computing embedding
Done.




KeyError: 0

<matplotlib.figure.Figure at 0x1170b98d0>