In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
%matplotlib inline

In [None]:
def read_data(path):
    """Load the sales CSV at ``path`` and normalise its columns.

    Steps performed:

    * ``FECHA`` is parsed as a datetime column.
    * The space-padded ``' CPRECIO '`` column is copied to ``CPRECIO`` with
      surrounding whitespace and ``","`` separators removed, then converted
      to numbers; values that cannot be parsed become ``NaN``.
    * The space-padded ``' COSTOPESOS '`` column is copied to ``COSTOPESOS``
      unchanged.
    * The two space-padded originals are dropped.
    * The third column from the end is renamed to ``"YEAR"``.

    Parameters
    ----------
    path : str
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    # ``infer_datetime_format`` is intentionally not passed: it was only a
    # parsing-speed hint and is deprecated in current pandas.
    df = pd.read_csv(path, parse_dates=['FECHA'])

    # Go through ``astype(str)`` + the ``.str`` accessor so that cells pandas
    # already parsed as numbers (or missing cells) do not raise, which the
    # previous ``x.strip()`` lambda did.
    price_text = df[' CPRECIO '].astype(str).str.strip().str.replace(",", "")
    # ``pd.to_numeric(errors='coerce')`` replaces the removed
    # ``Series.convert_objects(convert_numeric=True)``; unparseable values
    # still end up as NaN.
    df['CPRECIO'] = pd.to_numeric(price_text, errors='coerce')
    df['COSTOPESOS'] = df[' COSTOPESOS ']
    df = df.drop([' CPRECIO ', ' COSTOPESOS '], axis=1)

    # Rename by position on a fresh list instead of writing into
    # ``df.columns.values``, which mutates the Index's own buffer.
    cols = list(df.columns)
    cols[-3] = "YEAR"
    df.columns = cols
    return df

def read_test_data(path):
    """Load the test-set CSV at ``path`` and normalise its column names.

    ``FECHA`` is parsed as a datetime column. The space-padded
    ``' CPRECIO '`` and ``' COSTOPESOS '`` columns are copied, values
    unchanged, to ``CPRECIO`` and ``COSTOPESOS``, and the padded originals
    are dropped. The third column from the end is then renamed to ``"YEAR"``.

    Unlike ``read_data``, no string clean-up or numeric conversion is applied
    to ``CPRECIO`` here.

    Parameters
    ----------
    path : str
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The frame with normalised column names.
    """
    # ``infer_datetime_format`` is intentionally not passed: it was only a
    # parsing-speed hint and is deprecated in current pandas.
    df = pd.read_csv(path, parse_dates=['FECHA'])
    df['CPRECIO'] = df[' CPRECIO ']
    df['COSTOPESOS'] = df[' COSTOPESOS ']
    df = df.drop([' CPRECIO ', ' COSTOPESOS '], axis=1)

    # Rename by position on a fresh list instead of writing into
    # ``df.columns.values``, which mutates the Index's own buffer.
    cols = list(df.columns)
    cols[-3] = "YEAR"
    df.columns = cols
    return df

def calculate_extra_cols(df):
    """Add a ``total_price`` column to ``df`` in place and return ``df``.

    ``total_price`` is the row-wise product
    ``CPRECIO * #UNIDADES * CTIPOCAM01``.
    NOTE(review): CTIPOCAM01 is presumably an exchange rate - confirm.
    """
    price, units, rate = df['CPRECIO'], df['#UNIDADES'], df['CTIPOCAM01']
    # Same left-to-right multiplication order as ``price * units * rate``.
    df['total_price'] = price.mul(units).mul(rate)
    return df

df = read_data('./BASEVENTAS2010A2015.csv')
df = calculate_extra_cols(df)

# Trim surrounding whitespace from the MARCA and IDPRODUCTO labels.
# The vectorised ``.str`` accessor is used instead of
# ``map(lambda x: x.strip())`` so that missing values pass through as NaN
# rather than raising AttributeError.
df["MARCA"] = df["MARCA"].str.strip()
df["IDPRODUCTO"] = df["IDPRODUCTO"].str.strip()

In [None]:
# Keep only the three columns used to build the order-to-product matrix,
# then print a summary of the resulting frame.
subdf = df.loc[:, ["FOLIO_FACTURA", "IDPRODUCTO", "#UNIDADES"]]
subdf.info()

### Build Order to Product Matrix

In [4]:
# One row per FOLIO_FACTURA, one column per IDPRODUCTO. Each cell holds the
# number of non-null '#UNIDADES' entries for that (order, product) pair, with
# 0 where the pair never occurs.
order_prod = (
    subdf.groupby(["FOLIO_FACTURA", "IDPRODUCTO"])["#UNIDADES"]
    .count()
    .unstack()
    .fillna(0)
)

In [None]:
# Sanity check: (rows, columns) = one row per FOLIO_FACTURA, one column per IDPRODUCTO.
order_prod.shape

### For product 25967, let's see which products are usually bought together with it

In [None]:
# Select every order that contains product 25967. The cells of order_prod are
# per-order counts, so ``> 0`` is used: ``== 1`` silently dropped orders in
# which the product was counted more than once.
# Then total each product over those orders and show the largest totals first.
# NOTE(review): the original ``[:-10:-1]`` slice yields 9 rows, not 10; that
# count is preserved here - bump to 10 if a top-10 was intended. The queried
# product itself will normally be among the top rows.
(
    order_prod[order_prod["25967"] > 0]
    .sum(axis=0)
    .sort_values(ascending=False)
    .head(9)
)

### Create the pairwise cosine distance and similarity matrices between products

In [8]:
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances

# Transpose so that each row is one product described by its per-order counts;
# both matrices below are therefore product-by-product.
product_vectors = order_prod.values.T

similarities = cosine_similarity(product_vectors)
distances = pairwise_distances(product_vectors, metric="cosine")

In [None]:
# Confirm that we have the correct shape: both matrices should be square,
# with one row and one column per product.
# A single pre-formatted argument gives the same output under the Python 2
# print statement and the Python 3 print function (the bare
# ``print a, b`` form is a SyntaxError on Python 3).
print("{} {}".format(distances.shape, similarities.shape))

In [10]:
# Build a product-id -> column-position lookup so rows of the similarity and
# distance matrices can be addressed by product id; item_arr gives the
# reverse direction (position -> product id).
item_arr = order_prod.columns
item_dict = {product: position for position, product in enumerate(item_arr)}

In [None]:
# Peek at ten (product id, position) pairs. ``list(...)`` is required because
# dict views cannot be sliced on Python 3; on Python 2 the result is unchanged.
list(item_dict.items())[:10]

### Let's get the indices of some products commonly bought with item 25967

In [None]:
# Print the matrix positions of the four product ids on one space-separated
# line. Written as a single-argument print so it runs on both Python 2 and
# Python 3 with identical output.
print(" ".join(str(item_dict[product_id])
               for product_id in ("25967", "62382", "1668111", "2966171")))

### Now let's get the row for item 25967 (index 4164)

In [13]:
# Look the row up through item_dict instead of hard-coding the position
# (4164 in the original run), so the cell stays correct if the data, and
# therefore the column order of order_prod, changes.
product_idx = item_dict["25967"]
distance = distances[product_idx]
similarity = similarities[product_idx]

### And test against the indices that we found earlier

In [None]:
# Show the distance and similarity of item 25967 to three other products,
# followed by the product id stored at position 7112.
# NOTE(review): 7112, 2222 and 5085 are presumably the positions printed for
# the co-purchased products earlier - confirm against item_dict before
# relying on them with different data.
# Single-argument print calls keep the output identical on Python 2 while
# also being valid Python 3.
print("{} {} {} {}".format(distance[7112], distance[2222], distance[5085], item_arr[7112]))
print("{} {} {} {}".format(similarity[7112], similarity[2222], similarity[5085], item_arr[7112]))