# In [None]:
#矩阵补全
import xlrd
import numpy as np
import pandas as pd


class MF():
    """
    Biased matrix factorization trained with stochastic gradient descent.

    Every observed (non-NaN) entry X[i, j] is approximated by
    ``b + b_u[i] + b_v[j] + U[i, :] . V[j, :]``. After training, the same
    expression evaluated on the NaN positions provides the imputed values.
    """

    def __init__(self, X, k, alpha, beta, iterations):
        """
        Perform matrix factorization to predict np.nan entries in a matrix.
        Arguments
        - X (ndarray)      : sample-feature matrix, np.nan marks missing entries
        - k (int)          : number of latent dimensions
        - alpha (float)    : learning rate
        - beta (float)     : regularization parameter
        - iterations (int) : number of SGD passes over the observed entries
        """

        # asarray returns the input unchanged when it is already a float64
        # ndarray, and converts lists / integer arrays otherwise.
        self.X = np.asarray(X, dtype=float)
        self.num_samples, self.num_features = self.X.shape
        self.k = k
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        # True if not nan
        self.not_nan_index = ~np.isnan(self.X)

    def train(self):
        """
        Fit U, V and the biases on the observed entries of X.

        Returns a list of ``(iteration, total_square_error)`` tuples, one per
        iteration. Raises ValueError when X has no observed entry, because
        the global bias (mean of the observed entries) is then undefined.
        """
        if not self.not_nan_index.any():
            raise ValueError("X has no observed (non-NaN) entries to train on")

        # Initialize factorization matrix U and V
        self.U = np.random.normal(scale=1./self.k, size=(self.num_samples, self.k))
        self.V = np.random.normal(scale=1./self.k, size=(self.num_features, self.k))

        # Initialize the biases; the global bias is the mean observed value
        self.b_u = np.zeros(self.num_samples)
        self.b_v = np.zeros(self.num_features)
        self.b = np.mean(self.X[self.not_nan_index])

        # Create a list of training samples (row, column, value) in
        # row-major order, one per observed entry
        rows, cols = np.nonzero(self.not_nan_index)
        self.samples = [
            (i, j, self.X[i, j])
            for i, j in zip(rows.tolist(), cols.tolist())
        ]

        # Perform stochastic gradient descent for number of iterations
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            # total square error
            se = self.square_error()
            training_process.append((i, se))
            if (i+1) % 10 == 0:
                print("Iteration: %d ; error = %.4f" % (i+1, se))

        return training_process

    def square_error(self):
        """
        A function to compute the total square error over the observed
        (non-NaN) entries of X
        """
        residual = (self.X - self.full_matrix())[self.not_nan_index]
        return float(np.sum(residual ** 2))

    def sgd(self):
        """
        Perform one pass of stochastic gradient descent over self.samples
        """
        for i, j, x in self.samples:
            # Compute prediction and error
            e = x - self.get_x(i, j)

            # Update biases
            self.b_u[i] += self.alpha * (2 * e - self.beta * self.b_u[i])
            self.b_v[j] += self.alpha * (2 * e - self.beta * self.b_v[j])

            # Update factorization matrix U and V.
            # If "RuntimeWarning: overflow encountered in multiply" shows up,
            # turn down the learning rate alpha.
            # The gradient for V[j] must be taken at the U[i] that produced
            # the error e, so keep a copy from before U[i] is modified.
            u_before = self.U[i, :].copy()
            self.U[i, :] += self.alpha * (2 * e * self.V[j, :] - self.beta * self.U[i, :])
            self.V[j, :] += self.alpha * (2 * e * u_before - self.beta * self.V[j, :])

    def get_x(self, i, j):
        """
        Get the predicted x of sample i and feature j
        """
        return self.b + self.b_u[i] + self.b_v[j] + self.U[i, :].dot(self.V[j, :])

    def full_matrix(self):
        """
        Compute the full matrix using the resultant biases, U and V
        """
        return self.b + self.b_u[:, np.newaxis] + self.b_v[np.newaxis, :] + self.U.dot(self.V.T)

    def replace_nan(self, X_hat):
        """
        Replace np.nan of X with the corresponding value of X_hat

        Returns a new array; self.X is left untouched.
        """
        X = np.copy(self.X)
        missing = np.isnan(X)
        X[missing] = np.asarray(X_hat)[missing]
        return X


# Read the first worksheet of the source workbook.
datafile = u'E:\\桌面\\data_dict_source_7.19.xls'
data = xlrd.open_workbook(datafile)
table = data.sheets()[0]
nrows = table.nrows
ncols = table.ncols

# One DataFrame column per worksheet column (labelled 0..ncols-1), then
# relabel the columns with the values found in the first worksheet row.
data2 = pd.DataFrame({col: table.col_values(col) for col in range(ncols)})
data2 = data2.rename(columns=data2.iloc[0, :])

# Display options: at most 20 columns and 30 rows are printed, and cell
# text is truncated at 100 characters.
for option, value in (
    ('display.max_columns', 20),
    ('display.max_rows', 30),
    ('max_colwidth', 100),
):
    pd.set_option(option, value)

# data3 is the same frame as data2 (no column is removed here).
data3 = data2
# Drop the first row, which only holds the header values used above.
data4 = data3.drop(index=[0])


def mapping_1(data_text):
    """
    Convert a sequence of (text) values into integer codes.

    Every distinct value receives one code. Codes are numbered from 0 in
    order of first appearance, so the same input always yields the same
    output.

    Arguments
    - data_text (iterable) : hashable values, e.g. one DataFrame column

    Returns a pd.Series of int64 codes indexed 0..len(data_text)-1.
    """
    data_text = list(data_text)
    # dict.fromkeys drops duplicates while keeping first-seen order; the
    # resulting dict lookup replaces a linear list.index() scan per element.
    code_of = {value: code for code, value in enumerate(dict.fromkeys(data_text))}
    return pd.Series([code_of[value] for value in data_text], dtype="int64")


# Convert the text frame data4 into a numeric frame, column by column.
# The encoded columns are assembled with concat instead of being assigned
# through ``iloc`` on an empty frame: ``iloc`` only writes into rows that
# already exist, and a frame created with just ``columns=`` has none.
encoded_columns = [mapping_1(data4.iloc[:, col]) for col in range(data4.shape[1])]
matrix_num = pd.concat(encoded_columns, axis=1)
matrix_num.columns = data4.columns
# Cast to float up front so that NaN can be stored in any cell below.
matrix_num = matrix_num.astype(float)


# Blank out h randomly chosen cells (positions are drawn with replacement,
# so fewer than h distinct cells may end up missing).
h = 10
xx = np.random.randint(matrix_num.shape[0], size=h)
yy = np.random.randint(matrix_num.shape[1], size=h)
for i in range(h):
    matrix_num.iloc[xx[i], yy[i]] = np.nan


if __name__ == '__main__':
    # The np.float alias was removed in NumPy 1.24; the builtin float
    # selects the same float64 dtype.
    X = np.array(matrix_num, dtype=float)
    print(X)
    # Call np.random.seed(1) here to make the run reproducible.
    mf = MF(X, k=5, alpha=0.1, beta=0.1, iterations=100)
    mf.train()
    # X_hat is the model's prediction for every cell; X_comp keeps the
    # observed values of X and takes the prediction only where X is NaN.
    X_hat = mf.full_matrix()
    X_comp = mf.replace_nan(X_hat)

    print(X_hat)
    print(X_comp)
    print(type(X))

# In [None]:
#tf-idf判断文本相似度
# -*- coding: utf-8 -*-
import jieba
from gensim import corpora,models,similarities
import pandas as pd
import codecs

# Load the spreadsheet: x is the raw text column, y the value reported for
# each matching row.
data=pd.read_excel(r'C:\Users\Administrator\Desktop\0731复印机（原数据-标准值).xlsx')

x=data['标配纸盒容量（原）']
y=data['标配纸盒容量_y']

# Training corpus: one jieba token list per row of x. Missing cells become
# an empty document and other non-string cells are converted to text, so
# that row positions stay aligned with y.
Traintest_word = []
for word in x:
    text = '' if pd.isna(word) else str(word)
    Traintest_word.append(list(jieba.cut(text)))

# Query text
doc_test = ("350页（纸盒：250页；旁路：100页）,东芝（TOSHIBA）A3黑白复合机e-STUDIO2309A（主机+双面器+双面送稿器+第二纸盒+工作台）")

# Strip special characters from the query.
# NOTE(review): the query contains a full-width '；' while this chain only
# removes the ASCII ';', so that character is kept. The corpus above is not
# cleaned at all; confirm the intended cleaning before changing either side.
doc_te = doc_test.replace('（','').replace('）','').replace(',','').replace('：','').replace(';','')

# Tokenise the cleaned query with jieba
doc_test_list = list(jieba.cut(doc_te))

# Build the dictionary (token -> integer id) from the corpus
dictionary = corpora.Dictionary(Traintest_word)
a=dictionary.keys()
# Bag-of-words vectors for the corpus and for the query
corpus = [dictionary.doc2bow(doc) for doc in Traintest_word]
doc_test_vec = dictionary.doc2bow(doc_test_list)

# TF-IDF model fitted on the corpus, and a similarity index over the
# TF-IDF-weighted corpus documents
tfidf = models.TfidfModel(corpus)
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary))

# Similarity of the query against every corpus document
sim = index[tfidf[doc_test_vec]]
# List of (corpus row position, similarity) tuples, most similar first
SimilaritiesList = sorted(enumerate(sim), key=lambda item: -item[1])

# Print the best matches together with the y value of the matching row.
# Slicing keeps this safe when the corpus has fewer than TOP_N rows, and
# iloc is used because Result_index is a position, not an index label.
TOP_N = 6
for Result_tutple in SimilaritiesList[:TOP_N]:
    Result_index = Result_tutple[0]
    print(Result_tutple,y.iloc[Result_index])