Skip to content

Commit

Permalink
Merge b285cb6 into c0dfa3b
Browse files Browse the repository at this point in the history
  • Loading branch information
ljchang committed Jun 25, 2018
2 parents c0dfa3b + b285cb6 commit 813c401
Show file tree
Hide file tree
Showing 6 changed files with 81 additions and 54 deletions.
4 changes: 4 additions & 0 deletions .pytest_cache/v/cache/lastfailed
@@ -0,0 +1,4 @@
{
"emotioncf/tests/test_core.py::test_cf_mean": true,
"emotioncf/tests/test_core.py::test_cf_nnmf_multiplicative": true
}
9 changes: 9 additions & 0 deletions .pytest_cache/v/cache/nodeids
@@ -0,0 +1,9 @@
[
"emotioncf/tests/test_core.py::test_create_sub_by_item_matrix",
"emotioncf/tests/test_core.py::test_cf_mean",
"emotioncf/tests/test_core.py::test_cf_knn",
"emotioncf/tests/test_core.py::test_cf_knn_dil",
"emotioncf/tests/test_core.py::test_cf_nnmf_multiplicative",
"emotioncf/tests/test_core.py::test_cf_nnmf_sgd",
"emotioncf/tests/test_core.py::test_downsample"
]
91 changes: 46 additions & 45 deletions emotioncf/cf.py
Expand Up @@ -21,7 +21,7 @@ class BaseCF(object):

def __init__(self, ratings, mask=None, n_train_items=None):
if not isinstance(ratings, pd.DataFrame):
raise ValueError('ratings must be a pandas dataframe instance')
raise ValueError('ratings must be a pandas dataframe instance')
self.ratings = ratings
self.predicted_ratings = None
self.is_fit = False
Expand All @@ -47,7 +47,7 @@ def __repr__(self):
def get_mse(self, data='all'):

''' Get overall mean squared error for predicted compared to actual for all items and subjects. '''

if not self.is_fit:
raise ValueError('You must fit() model first before using this method.')
if not self.is_predict:
Expand Down Expand Up @@ -149,7 +149,7 @@ def split_train_test(self, n_train_items=20):
n_train_items: (int) number of items for test dictionary or list of specific items
'''

self.n_train_items = int(n_train_items)
self.train_mask = self.ratings.copy()
self.train_mask.loc[:,:] = np.zeros(self. ratings.shape).astype(bool)
Expand All @@ -164,12 +164,12 @@ def plot_predictions(self):
''' Create plot of actual and predicted ratings'''

import matplotlib.pyplot as plt
import seaborn as sns
import seaborn as sns
if not self.is_fit:
raise ValueError('You must fit() model first before using this method.')
if not self.is_predict:
raise ValueError('You must predict() model first before using this method.')

if self.is_mask:
f, ax = plt.subplots(nrows=1,ncols=3, figsize=(15,8))
else:
Expand Down Expand Up @@ -205,12 +205,12 @@ def plot_predictions(self):
def downsample(self, sampling_freq=None, target=None, target_type='samples'):

''' Downsample rating matrix to a new target frequency or number of samples using averaging.
Args:
sampling_freq: Sampling frequency of data
sampling_freq: Sampling frequency of data
target: downsampling target
target_type: type of target can be [samples,seconds,hz]
'''

if sampling_freq is None:
Expand All @@ -237,20 +237,20 @@ def ds(ratings, sampling_freq=sampling_freq, target=None, target_type='samples')
idx = np.concatenate([idx, np.repeat(idx[-1]+1,ratings.shape[0]-len(idx))])
return ratings.groupby(idx).mean().T

self.ratings = ds(self.ratings, sampling_freq=sampling_freq, target=target,
self.ratings = ds(self.ratings, sampling_freq=sampling_freq, target=target,
target_type=target_type)

if self.is_mask:
self.train_mask = ds(self.train_mask, sampling_freq=sampling_freq,
self.train_mask = ds(self.train_mask, sampling_freq=sampling_freq,
target=target, target_type=target_type)
self.train_mask.loc[:,:] = self.train_mask>0

if self.is_predict:
self.predicted_ratings = ds(self.predicted_ratings,
self.predicted_ratings = ds(self.predicted_ratings,
sampling_freq=sampling_freq, target=target, target_type=target_type)

def to_long_df(self):

''' Create a long format pandas dataframe with observed, predicted, and mask.'''

observed = pd.DataFrame(columns=['Subject','Item','Rating','Condition'])
Expand Down Expand Up @@ -281,7 +281,7 @@ def to_long_df(self):
def _conv_ts_mean_overlap(self, sub_rating, n_samples=5):

'''Dilate each rating by n samples (centered). If dilated samples are overlapping they will be averaged.
Args:
sub_rating: vector of ratings for subject
n_samples: number of samples to dilate each rating
Expand All @@ -305,7 +305,7 @@ def _conv_ts_mean_overlap(self, sub_rating, n_samples=5):

def _dilate_ts_rating_samples(self, n_samples=None):

''' Helper function to dilate sparse time-series ratings by n_samples.
''' Helper function to dilate sparse time-series ratings by n_samples.
Overlapping ratings will be averaged
Args:
Expand All @@ -314,15 +314,18 @@ def _dilate_ts_rating_samples(self, n_samples=None):
Returns:
masked_ratings: pandas ratings instance that has been dilated by n_samples
'''

if n_samples is None:
raise ValueError('Please specify number of samples to dilate.')

if not self.is_mask:
raise ValueError('Make sure cf instance has been masked.')

masked_ratings = self.ratings[self.train_mask]
return masked_ratings.apply(lambda x: self._conv_ts_mean_overlap(x, n_samples=n_samples), axis=1)
return masked_ratings.apply(lambda x: self._conv_ts_mean_overlap(x,
n_samples=n_samples),
axis=1,
result_type='broadcast')

class Mean(BaseCF):

Expand All @@ -338,12 +341,12 @@ def fit(self, dilate_ts_n_samples=None):
Args:
metric: type of similarity {"correlation","cosine"}
dilate_ts_n_samples: will dilate masked samples by n_samples to leverage auto-correlation
dilate_ts_n_samples: will dilate masked samples by n_samples to leverage auto-correlation
in estimating time-series ratings
'''

if self.is_mask:
if self.is_mask:
if dilate_ts_n_samples is None:
self.mean = self.ratings[self.train_mask].mean(skipna=True, axis=0)
else:
Expand Down Expand Up @@ -386,12 +389,12 @@ def fit(self, metric='pearson', dilate_ts_n_samples=None):
Args:
metric: type of similarity {"pearson",,"spearman","correlation","cosine"}. Note pearson and spearman are way faster.
dilate_ts_n_samples: will dilate masked samples by n_samples to leverage auto-correlation
dilate_ts_n_samples: will dilate masked samples by n_samples to leverage auto-correlation
in estimating time-series ratings
'''

if self.is_mask:
if self.is_mask:
if dilate_ts_n_samples is None:
ratings = self.ratings[self.train_mask]
else:
Expand All @@ -411,9 +414,9 @@ def cosine_similarity(x,y):
for x in ratings.iterrows():
for y in ratings.iterrows():
if metric is 'correlation':
sim.loc[x[0],y[0]] = pearsonr(x[1][(~x[1].isnull()) & (~y[1].isnull())],y[1][(~x[1].isnull()) & (~y[1].isnull())])[0]
sim.loc[x[0],y[0]] = pearsonr(x[1][(~x[1].isnull()) & (~y[1].isnull())],y[1][(~x[1].isnull()) & (~y[1].isnull())])[0]
elif metric is 'cosine':
sim.loc[x[0],y[0]] = cosine_similarity(x[1][(~x[1].isnull()) & (~y[1].isnull())],y[1][(~x[1].isnull()) & (~y[1].isnull())])
sim.loc[x[0],y[0]] = cosine_similarity(x[1][(~x[1].isnull()) & (~y[1].isnull())],y[1][(~x[1].isnull()) & (~y[1].isnull())])
else:
raise NotImplementedError("%s is not implemented yet. Try ['pearson','spearman','correlation','cosine']" % metric )
self.subject_similarity = sim
Expand Down Expand Up @@ -448,24 +451,24 @@ def predict(self, k=None):
self.is_predict = True

class NNMF_multiplicative(BaseCF):
''' Train non negative matrix factorization model using multiplicative updates.
''' Train non negative matrix factorization model using multiplicative updates.
Allows masking to only learn the training weights.
Based on http://stackoverflow.com/questions/22767695/
python-non-negative-matrix-factorization-that-handles-both-zeros-and-missing-dat
'''

def __init__(self, ratings, mask=None, n_train_items=None):
super(NNMF_multiplicative, self).__init__(ratings, mask, n_train_items)
self.H = None
self.W = None

def fit(self,
n_factors = None,
n_factors = None,
max_iterations = 100,
error_limit = 1e-6,
fit_error_limit = 1e-6,
error_limit = 1e-6,
fit_error_limit = 1e-6,
verbose = False,
dilate_ts_n_samples = None):

Expand All @@ -477,7 +480,7 @@ def fit(self,
error_limit (float): error tolerance (default=1e-6)
fit_error_limit (float): fit error tolerance (default=1e-6)
verbose (bool): verbose output during fitting procedure (default=True)
dilate_ts_n_samples (int): will dilate masked samples by n_samples to leverage auto-correlation
dilate_ts_n_samples (int): will dilate masked samples by n_samples to leverage auto-correlation
in estimating time-series ratings
'''
Expand All @@ -492,7 +495,7 @@ def fit(self,
avg = np.sqrt(np.nanmean(self.ratings)/n_factors)
self.H = avg*np.random.rand(n_items, n_factors) # H = Y
self.W = avg*np.random.rand(n_users, n_factors) # W = A

if self.is_mask:
if dilate_ts_n_samples is None:
mask = self.train_mask.values
Expand Down Expand Up @@ -550,21 +553,21 @@ def predict(self):
self.is_predict = True

class NNMF_sgd(BaseCF):
''' Train non negative matrix factorization model using stochastic gradient descent.
''' Train non negative matrix factorization model using stochastic gradient descent.
Allows masking to only learn the training weights.
This code is based off of Ethan Rosenthal's excellent tutorial
This code is based off of Ethan Rosenthal's excellent tutorial
on collaborative filtering https://blog.insightdatascience.com/
explicit-matrix-factorization-als-sgd-and-all-that-jazz-b00e4d9b21ea#.kkr7mzvr2
'''

def __init__(self, ratings, mask=None, n_train_items=None):
super(NNMF_sgd, self).__init__(ratings, mask, n_train_items)

def fit(self,
n_factors=None,
item_fact_reg=0.0,
def fit(self,
n_factors=None,
item_fact_reg=0.0,
user_fact_reg=0.0,
item_bias_reg=0.0,
user_bias_reg=0.0,
Expand All @@ -581,7 +584,7 @@ def fit(self,
error_limit (float): error tolerance (default=1e-6)
fit_error_limit (float): fit error tolerance (default=1e-6)
verbose (bool): verbose output during fitting procedure (default=True)
dilate_ts_n_samples (int): will dilate masked samples by n_samples to leverage auto-correlation
dilate_ts_n_samples (int): will dilate masked samples by n_samples to leverage auto-correlation
in estimating time-series ratings
'''
Expand All @@ -606,7 +609,7 @@ def fit(self,
sample_row, sample_col = ratings.values.nonzero()
self.global_bias = self.ratings[~self.ratings.isnull()].mean().mean()

# initialize latent vectors
# initialize latent vectors
self.user_vecs = np.random.normal(scale=1./n_factors, size=(n_users, n_factors))
self.item_vecs = np.random.normal(scale=1./n_factors, size=(n_items, n_factors))

Expand All @@ -633,11 +636,11 @@ def fit(self,
prediction = self._predict_single(u,i)

e = (ratings.iloc[u,i] - prediction) # error

# Update biases
self.user_bias[u] += (learning_rate * (e - self.user_bias_reg * self.user_bias[u]))
self.item_bias[i] += (learning_rate * (e - self.item_bias_reg * self.item_bias[i]))

# Update latent factors
self.user_vecs[u, :] += (learning_rate * (e * self.item_vecs[i, :] - self.user_fact_reg * self.user_vecs[u,:]))
self.item_vecs[i, :] += (learning_rate * (e * self.user_vecs[u, :] - self.item_fact_reg * self.item_vecs[i,:]))
Expand Down Expand Up @@ -666,5 +669,3 @@ def _predict_single(self, u, i):
prediction = self.global_bias + self.user_bias[u] + self.item_bias[i]
prediction += self.user_vecs[u, :].dot(self.item_vecs[i, :].T)
return prediction


2 changes: 1 addition & 1 deletion emotioncf/version.py
@@ -1 +1 @@
__version__ = '0.0.1'
__version__ = '0.0.2'
4 changes: 2 additions & 2 deletions requirements.txt
@@ -1,5 +1,5 @@
numpy
pandas
pandas >= 0.23.0
scipy
matplotlib
seaborn
seaborn
25 changes: 19 additions & 6 deletions setup.py
@@ -1,21 +1,34 @@
"""Packaging script for the emotioncf distribution."""
from setuptools import setup, find_packages

# Load __version__ by executing version.py into a private namespace instead
# of importing the emotioncf package from inside setup.py.
version = {}
with open("emotioncf/version.py") as f:
    exec(f.read(), version)

# One requirement specifier per line; skip blank lines and '#' comments so
# they never reach install_requires.
with open('requirements.txt') as f:
    requirements = [
        line.strip() for line in f
        if line.strip() and not line.strip().startswith('#')
    ]

extra_setuptools_args = dict(
    tests_require=['pytest']
)

setup(
    name="emotioncf",
    # The exec() above stores the value in the `version` dict; a bare
    # `__version__` name is not defined in this module.
    version=version['__version__'],
    # Adjacent string literals are concatenated; there must be no comma
    # between them and the quotes must match.
    description='A Python package for performing Collaborative Filtering on '
                'sparse emotion ratings',
    maintainer='Luke Chang',
    maintainer_email='luke.j.chang@dartmouth.edu',
    url='http://github.com/ljchang/emotionCF',
    install_requires=requirements,
    # NOTE(review): find_packages() matches dotted package names, so the
    # path-style pattern 'emotioncf/tests' probably excludes nothing --
    # confirm whether 'emotioncf.tests' was intended.
    packages=find_packages(exclude=['emotioncf/tests']),
    license='MIT',
    keywords=['emotion', 'collaborative filtering', 'recommender',
              'machine-learning'],
    classifiers=[
        "Programming Language :: Python :: 2.7",
        "Programming Language :: Python :: 3.6",
        "Operating System :: OS Independent",
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: MIT License"
    ],
    **extra_setuptools_args
)

0 comments on commit 813c401

Please sign in to comment.