In [3]:
from enum import Enum


class ZYJStatus(Enum):
    """Two-valued status enumeration with string payloads.

    Members carry the values ``'success'`` and ``'skipped'``.

    NOTE(review): none of the visible cells reference these members;
    presumably they label the outcome of processing one input -- confirm
    against the code that consumes them.
    """
    SUCCESS = 'success'
    SKIPPED = 'skipped'


class ZCols(str, Enum):
    """Column names shared by the review-processing helpers.

    The helpers in this notebook index DataFrames directly with these members
    (for example ``df[ZCols.REVIEW_CONTENT]``).  A plain ``Enum`` member is not
    equal to, and does not hash like, its string value, so such a lookup can
    never find a column that is really named ``'content'``.  Mixing in ``str``
    makes every member behave as its value for equality, hashing and dict /
    column lookup, while ``.name``, ``.value``, iteration order and
    ``ZCols('content')`` keep working exactly as before.
    """

    # input table column names
    REVIEW_CONTENT = 'content'
    RATING = 'score'
    REVIEW_ID = 'id'

    # derived column names
    LONG_ENOUGH = 'long_enough'
    CHN_STR_LEN = 'char_length'
    CHN_ONLY_REVIEW = 'chn_only_review'

    # output table column names
    NEG_REVIEW = 'negative_reviews'
    POS_REVIEW = 'positive_reviews'
    PROD_ID = 'prod_id'
    # "NED" is a historical misspelling of "NEG"; it stays the canonical member
    # so existing references keep resolving to the same object.
    VALID_NED_REVIEW_ID = 'valid_negative_reviews_id'
    # Correctly spelled alias (same value, therefore the same member object;
    # aliases do not appear when iterating over the enum).
    VALID_NEG_REVIEW_ID = 'valid_negative_reviews_id'
    VALID_POS_REVIEW_ID = 'valid_positive_reviews_id'
    POS_TAGS = 'positive_tags'
    NEG_TAGS = 'negative_tags'
    TAGS = 'tags'
    SUMMARY = 'summary'
    NUM_VALID_REVIEWS = 'num_valid_reviews'
    NUM_TOTAL_REVIEWS = 'num_total_reviews'
    PERCENT_QUALITY_REVIEWS = 'percent_quality_reviews'

def chn_char_only(input_str):
    """Blank out ASCII letters, digits and a fixed set of punctuation.

    Every character matched by the pattern below is replaced with a single
    space, then leading/trailing whitespace is stripped.  Characters that are
    not listed (for example ``!``, ``-`` or most full-width punctuation) are
    left untouched, so the result is not guaranteed to be purely Chinese.

    Args:
        input_str: Text to clean.  Non-string values (``None``, ``NaN`` ...)
            are tolerated.

    Returns:
        The cleaned string, or ``""`` when ``input_str`` is not a string.
    """
    # Local import so the error branch cannot fail with a NameError: no
    # module-level ``logger`` is defined in the visible notebook cells.
    import logging

    # Raw string: the previous non-raw literal relied on invalid escape
    # sequences, which newer Python versions warn about.  The matched set is
    # unchanged: A-Z a-z 0-9 [ ] ` ~ @ # $ ^ & * ( ) = | { } ' : ; , . < > / ?
    # backslash % plus the full-width comma, full-width stop and the ratio sign.
    pttn = r"[A-Za-z0-9\[\]`~@#$^&*()=|{}':;，。∶,.<>/?\\%]"
    try:
        str_out = re.sub(pttn, " ", input_str).strip()
    except TypeError:
        # re.sub rejects non-string input; log it and fall back to empty text.
        logging.getLogger(__name__).error("TypeError encountered: {}".format(input_str))
        str_out = ""
    return str_out

def top_tags(review_contents, top_n):
    """Return the ``top_n`` most frequent keywords across all reviews.

    Keywords are extracted per review with ``jieba.analyse.extract_tags``
    (at most ``top_n`` per review), pooled, and ranked by how many times they
    were extracted.

    Args:
        review_contents: DataFrame holding the cleaned review text in the
            ``ZCols.CHN_ONLY_REVIEW`` column.
        top_n: Maximum number of keywords to return.

    Returns:
        A tuple of at most ``top_n`` keywords, most frequent first.  The tuple
        is empty when no keyword was extracted (previously an empty list was
        returned in that case, while every other case returned a tuple).
    """
    # ``import jieba`` alone does not load the ``analyse`` submodule.
    import jieba.analyse

    per_review_tags = review_contents[ZCols.CHN_ONLY_REVIEW].apply(
        lambda review_x: jieba.analyse.extract_tags(review_x, topK=top_n))
    tag_counts = Counter(tag for tags in per_review_tags for tag in tags)
    # most_common(top_n) yields exactly top_n entries (or fewer if not enough
    # distinct tags exist); the previous ``[0: top_n - 1]`` slice dropped one.
    return tuple(tag for tag, _ in tag_counts.most_common(top_n))

def pre_process(df_input, len_threshold):
    """Clean review text and keep only the reviews that are long enough.

    Side effect: three derived columns are written onto ``df_input`` itself
    (``ZCols.CHN_ONLY_REVIEW``, ``ZCols.CHN_STR_LEN``, ``ZCols.LONG_ENOUGH``).

    Args:
        df_input: DataFrame with the ``ZCols.REVIEW_CONTENT``, ``ZCols.RATING``
            and ``ZCols.REVIEW_ID`` columns.
        len_threshold: A review is kept only when its cleaned text is strictly
            longer than this many characters.

    Returns:
        An independent copy containing the cleaned text, rating and review id
        of the rows that passed the length filter.
    """
    def _exceeds_threshold(char_count):
        return char_count > len_threshold

    cleaned_col = ZCols.CHN_ONLY_REVIEW
    length_col = ZCols.CHN_STR_LEN
    keep_col = ZCols.LONG_ENOUGH

    # Derived columns are attached to the caller's frame, one step at a time.
    df_input[cleaned_col] = df_input[ZCols.REVIEW_CONTENT].apply(chn_char_only)
    df_input[length_col] = df_input[cleaned_col].apply(len)
    df_input[keep_col] = df_input[length_col].apply(_exceeds_threshold)

    kept_rows = df_input[df_input[keep_col]]
    return kept_rows[[cleaned_col, ZCols.RATING, ZCols.REVIEW_ID]].copy()

def get_list_of_files(input_path):
    """List the files at ``input_path``, descending one directory level.

    Args:
        input_path: A file or directory path (``str`` or ``Path``).

    Returns:
        A list of absolute ``Path`` objects.  For a directory: its own files
        first, followed by the files of each immediate sub-directory (deeper
        levels are not visited); every group is sorted so the result is
        reproducible.  For anything else: a one-element list with the
        resolved path.

    Raises:
        SystemExit: when ``input_path`` does not exist.
    """
    # Local imports: neither ``sys`` nor a module-level ``logger`` is defined
    # in the visible notebook cells.
    import logging
    import sys

    try:
        # strict=True is required on Python 3.6+: a non-strict resolve() no
        # longer raises for missing paths, so the check below never fired.
        abs_path = Path(input_path).resolve(strict=True)
    except FileNotFoundError:
        logging.getLogger(__name__).critical("Input path does not exist, exit.")
        # Non-zero status so the failure is visible to whoever launched us.
        sys.exit(1)

    if not abs_path.is_dir():
        return [abs_path]

    # Read the directory once; iterdir() order is arbitrary, hence the sort.
    entries = sorted(abs_path.iterdir())
    files = [entry for entry in entries if entry.is_file()]
    for directory in (entry for entry in entries if entry.is_dir()):
        files.extend(sorted(child for child in directory.iterdir() if child.is_file()))
    return files

In [4]:
from __future__ import print_function
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans
from collections import Counter
import logging.config
import json
import codecs
from pathlib import Path

# Collect every file under the product data directory (the directory's own
# files plus those one sub-directory level down, per get_list_of_files).
# NOTE(review): absolute, machine-specific path -- presumably this should come
# from a configurable data directory; confirm before running elsewhere.
f_list = get_list_of_files(r"/Users/xma/Documents/PyProjects/data/product_1315_1343_9710")

# Show the collected paths.
print(f_list)

[PosixPath('/Users/xma/Documents/PyProjects/data/product_1315_1343_9710/.csv'), PosixPath('/Users/xma/Documents/PyProjects/data/product_1315_1343_9710/1315_1343_9710.jl'), PosixPath('/Users/xma/Documents/PyProjects/data/product_1315_1343_9710/1315_1343_9710.status'), PosixPath('/Users/xma/Documents/PyProjects/data/product_1315_1343_9710/1315_1343_9710_comment.jl'), PosixPath('/Users/xma/Documents/PyProjects/data/product_1315_1343_9710/1315_1343_9710_comment.status'), PosixPath('/Users/xma/Documents/PyProjects/data/product_1315_1343_9710/status.json'), PosixPath('/Users/xma/Documents/PyProjects/data/product_1315_1343_9710/10005573212/10005573212.csv'), PosixPath('/Users/xma/Documents/PyProjects/data/product_1315_1343_9710/10005573212/10005573212.jl'), PosixPath('/Users/xma/Documents/PyProjects/data/product_1315_1343_9710/10005573212/10005573212.status'), PosixPath('/Users/xma/Documents/PyProjects/data/product_1315_1343_9710/10005573212/10005573212_meta.json'), PosixPath('/Users/xma/Docu

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from pandas.parser import CParserError, EmptyDataError
def read_as_pandas(file_path, sep="\t"):
    """Read a delimited text file into a DataFrame, tolerating bad files.

    Args:
        file_path: Path of the file to read.
        sep: Field delimiter; defaults to a tab, the previously hard-coded
            value.

    Returns:
        The parsed DataFrame, or an empty DataFrame when the file is empty or
        cannot be tokenised.  Other errors (for example a missing file)
        propagate unchanged.
    """
    import logging

    # Resolve the exception classes here so the function does not depend on
    # names imported from the long-removed ``pandas.parser`` module.
    try:
        from pandas.errors import EmptyDataError, ParserError
    except ImportError:
        # Very old pandas (< 0.20): fall back to the historical location.
        from pandas.parser import CParserError as ParserError, EmptyDataError

    try:
        return pd.read_csv(file_path, index_col=None, sep=sep)
    except (ParserError, EmptyDataError) as exc:
        # Best-effort by design, but leave a trace instead of failing silently.
        logging.getLogger(__name__).warning(
            "Could not parse %s (%s); returning an empty DataFrame.", file_path, exc)
        return pd.DataFrame()

In [8]:
def chn_char_only(input_str):
    """Blank out ASCII letters, digits and a fixed set of punctuation.

    Every character matched by the pattern below is replaced with a single
    space, then leading/trailing whitespace is stripped.  Characters that are
    not listed (for example ``!``, ``-`` or most full-width punctuation) are
    left untouched, so the result is not guaranteed to be purely Chinese.

    Args:
        input_str: Text to clean.  Non-string values (``None``, ``NaN`` ...)
            are tolerated.

    Returns:
        The cleaned string, or ``""`` when ``input_str`` is not a string.
    """
    # Local import so the error branch cannot fail with a NameError: no
    # module-level ``logger`` is defined in the visible notebook cells.
    import logging

    # Raw string: the previous non-raw literal relied on invalid escape
    # sequences, which newer Python versions warn about.  The matched set is
    # unchanged: A-Z a-z 0-9 [ ] ` ~ @ # $ ^ & * ( ) = | { } ' : ; , . < > / ?
    # backslash % plus the full-width comma, full-width stop and the ratio sign.
    pttn = r"[A-Za-z0-9\[\]`~@#$^&*()=|{}':;，。∶,.<>/?\\%]"
    try:
        str_out = re.sub(pttn, " ", input_str).strip()
    except TypeError:
        # re.sub rejects non-string input; log it and fall back to empty text.
        logging.getLogger(__name__).error("TypeError encountered: {}".format(input_str))
        str_out = ""
    return str_out

def pre_process(self, df_input, len_threshold):
    """Clean review text and keep only the reviews that are long enough.

    Written in method form: ``self`` must provide a ``chn_char_only``
    attribute, which is applied to every value of the review-content column.

    NOTE(review): this is defined at module level and shares its name with a
    two-argument ``pre_process`` elsewhere in the notebook; whichever cell
    runs last wins -- confirm which variant is intended.

    Side effect: three derived columns are written onto ``df_input`` itself
    (``ZCols.CHN_ONLY_REVIEW``, ``ZCols.CHN_STR_LEN``, ``ZCols.LONG_ENOUGH``).

    Args:
        self: Object exposing ``chn_char_only``.
        df_input: DataFrame with the ``ZCols.REVIEW_CONTENT``, ``ZCols.RATING``
            and ``ZCols.REVIEW_ID`` columns.
        len_threshold: A review is kept only when its cleaned text is strictly
            longer than this many characters.

    Returns:
        An independent copy containing the cleaned text, rating and review id
        of the rows that passed the length filter.
    """
    def _exceeds_threshold(char_count):
        return char_count > len_threshold

    cleaned_col = ZCols.CHN_ONLY_REVIEW
    length_col = ZCols.CHN_STR_LEN
    keep_col = ZCols.LONG_ENOUGH

    # Derived columns are attached to the caller's frame, one step at a time.
    df_input[cleaned_col] = df_input[ZCols.REVIEW_CONTENT].apply(self.chn_char_only)
    df_input[length_col] = df_input[cleaned_col].apply(len)
    df_input[keep_col] = df_input[length_col].apply(_exceeds_threshold)

    kept_rows = df_input[df_input[keep_col]]
    return kept_rows[[cleaned_col, ZCols.RATING, ZCols.REVIEW_ID]].copy()

TypeError: 'generator' object is not subscriptable

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba

# Load the fake-review sample and show its text column.
# NOTE(review): absolute, machine-specific path -- presumably this should come
# from a configurable data directory; confirm before running elsewhere.
fake_reviews_path = r"/Users/xma/Desktop/taobao"
df_fake_reviews = read_as_pandas(fake_reviews_path)
print(df_fake_reviews['review'])

# Segment every review with jieba (precise mode) and re-join the tokens with
# spaces.  This previously read ``df_long_reviews``, a name that is never
# defined in the visible cells and fails with NameError on a fresh kernel; the
# frame loaded just above is used instead.
# NOTE(review): if a length-filtered frame was intended here, build it
# explicitly in this cell rather than relying on leftover kernel state.
doc_word_split = df_fake_reviews['review'].apply(lambda x: " ".join(jieba.cut(x, cut_all=False)))
print(doc_word_split)

ModuleNotFoundError: No module named 'jieba'