In [34]:
# Packages
import string
import regex as re
import zipfile
import gc
from scipy.stats import boxcox
import sys 
# other python utilities 
from collections import Counter 
from tqdm import tqdm 
import pandas as pd 
import numpy as np 
import pickle
import string
from termcolor import colored

# for text and visual analytics
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go
# Display options: cap the width of text cells at 50 characters.
pd.set_option('display.max_colwidth', 50)


# for SVD

# Render floats with five decimal places.
pd.set_option('display.float_format','{:.5f}'.format)

# Location of the input CSV read by the loading cell.
# Raw string: the backslashes of a Windows path must not be parsed as escape
# sequences (unknown escapes such as "\P" emit a SyntaxWarning on Python 3.12+).
DF_PATH = r"D:\Papers\Paper 3 - Recommender Systems\Recommender-systems\Files\Oct_Forth_projectType.csv"

In [35]:
# Read the whole input CSV into a DataFrame.
df = pd.read_csv(DF_PATH)

print(colored('categorizing the following columns for less memory usage...', 'green'))

# A column qualifies when it holds at most 10 distinct values.
categories_cols = [
    name for name in df.columns
    if len(df[name].unique()) <= 10
]
for name in categories_cols:
    print(name)

# Switch every qualifying column to the `category` dtype in a single assignment.
df[categories_cols] = df[categories_cols].astype('category')

[32mcategorizing the following columns for less memory usage...[0m
Project Grade Level Category
Project Resource Category
Project Current Status
School Metro Type
School State
Donor State


In [36]:
# Display the column names of the loaded frame.
df.columns

Index(['Project ID', 'Donation ID', 'Donor ID', 'Donation Amount',
       'Donation Received Date', 'School ID', 'Project Title',
       'Project Need Statement', 'Project Subject Category Tree',
       'Project Subject Subcategory Tree', 'Project Grade Level Category',
       'Project Resource Category', 'Project Cost', 'Project Posted Date',
       'Project Expiration Date', 'Project Current Status',
       'Project Fully Funded Date', 'School Metro Type', 'School State',
       'School Zip', 'Donor State', 'Donor Zip'],
      dtype='object')

### Cleaning

In [37]:

# Columns carried over into the per-project table.
columns_to_keep = [
    'Project ID',
    'Project Subject Subcategory Tree',
    'Project Title',
    'Project Need Statement',
    'School State',
    'School Metro Type',
    'School Zip',
    'Project Grade Level Category',
]

# One row per project: keep the first row seen for each 'Project ID'
# and renumber the index from 0.
projects_df = (
    df.loc[:, columns_to_keep]
    .drop_duplicates(subset=['Project ID'])
    .reset_index(drop=True)
)
projects_id = projects_df['Project ID'].tolist()
print("len(projects_id) = ", len(projects_id))


# clean the text
# Regex patterns that are replaced by a space before any other normalisation.
# NOTE(review): the second entry is an empty string here; it may originally have
# held a non-printing character that was lost in export -- confirm, and if so
# restore it as an explicit escape (e.g. '\x9d') so it is visible in the source.
patterns = ['Â', '']
# Characters to delete: everything in string.punctuation except '&', '.' and ','.
punct = re.sub(r'[\&\.\,]', '', string.punctuation)

def clean(text):
    """Normalise one project description string.

    Steps, in order: replace every non-empty entry of ``patterns`` with a
    space, delete the characters listed in ``punct``, lower-case the text,
    drop isolated single-character tokens other than "i"/"a", and collapse a
    space followed by further whitespace into one space.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    for pattern in patterns:
        # An empty regex matches at every position, so substituting it would
        # insert a space between all characters; skip such entries.
        if not pattern:
            continue
        text = re.sub(pattern, ' ', text)
    text = text.translate(str.maketrans('', '', punct))
    text = text.lower()
    # A single word character surrounded by spaces is removed unless it is i/a.
    text = re.sub(r' +(?![iaIA])[\w] +', ' ', text)
    text = re.sub(r' \s+', ' ', text)

    return text

# add them up
def _joined_fields(row):
    """Return title, need statement, subject subcategory tree and school state
    of the project at index label ``row``, joined with ' & '."""
    title = projects_df.loc[row, 'Project Title']
    need = projects_df.loc[row, 'Project Need Statement']
    subject = projects_df.loc[row, 'Project Subject Subcategory Tree']
    state = projects_df.loc[row, 'School State']
    return ' & '.join([title, need, subject, state])

# One cleaned text per project, in row order.
project_txt = [
    clean(_joined_fields(row))
    for row in tqdm(range(len(projects_df)), position=0, leave=True)
]

projects_df.loc[:, 'title_needstatement_subjsubcattree_schstate'] = project_txt
print("len(project_txt) = ", len(project_txt))


100%|██████████| 860/860 [00:00<00:00, 15398.31it/s]

len(projects_id) =  860
len(project_txt) =  860





In [38]:
# Character count of the raw title + need statement + subject subcategory tree.
# The length of a concatenation equals the sum of the parts' lengths, so the
# per-column `.str.len()` results are added instead of looping over rows.
raw_text_cols = ['Project Title', 'Project Need Statement', 'Project Subject Subcategory Tree']
projects_df['title_needstatement_subjsubcattree_len'] = sum(
    projects_df[col].str.len() for col in raw_text_cols
)

# Character count of the cleaned, combined text (which also contains the school state).
projects_df['title_needstatement_subjsubcattree_schstate_len'] = (
    projects_df['title_needstatement_subjsubcattree_schstate'].str.len()
)


In [39]:
# Histogram of the cleaned combined-text length (the *_schstate_len column).
# The frame is passed with the column named in `x` so that `labels`, whose keys
# must be column names, actually renames the x-axis; the y-axis is not a data
# column, so its title is set on the layout.
length_col = 'title_needstatement_subjsubcattree_schstate_len'
fig = px.histogram(
    projects_df,
    x=length_col,
    title="Number of Letters in Project Descriptions",
    labels={length_col: "Number of Letters"},
    opacity=0.8
    )
fig.update_layout(yaxis_title="Count")
fig.show()

In [40]:
# Histogram of the raw text length (the title_needstatement_subjsubcattree_len column).
# The frame is passed with the column named in `x` so that `labels`, whose keys
# must be column names, actually renames the x-axis; the y-axis is not a data
# column, so its title is set on the layout.
length_col = 'title_needstatement_subjsubcattree_len'
fig = px.histogram(
    projects_df,
    x=length_col,
    title="Number of Letters in Project Descriptions",
    labels={length_col: "Number of Letters"},
    opacity=0.8
    )
fig.update_layout(yaxis_title="Count")
fig.show()

In [41]:
# Name of the combined, cleaned text column to retain.
keep_col = 'title_needstatement_subjsubcattree_schstate'

# Keep only the project fields plus the combined text column; any other column
# is dropped. .copy() makes the subset an independent frame, so later in-place
# edits (rename, new columns) do not raise SettingWithCopyWarning.
projects_df = projects_df[columns_to_keep + [keep_col]].copy()
    

In [42]:
# Give the combined text column its short name. Reassigning the result instead
# of using inplace=True leaves the previous frame object untouched.
projects_df = projects_df.rename(columns={keep_col: 'project_txt'})

In [43]:
# Show the first row as a quick check of the resulting columns.
projects_df.head(1)

Unnamed: 0,Project ID,Project Subject Subcategory Tree,Project Title,Project Need Statement,School State,School Metro Type,School Zip,Project Grade Level Category,project_txt
0,0029e426fd3296af4fc333580fa895fe,"Character Education, Literature & Writing","Everyone Needs an Address, Especially Maniac M...",My students need a class set of the book Mania...,Georgia,suburban,30238,Grades 3-5,"everyone needs an address, especially maniac m..."


In [44]:
# Output location for the per-project table.
# Raw string so the backslashes of the Windows path are not treated as escape
# sequences (unknown escapes such as "\P" emit a SyntaxWarning on Python 3.12+).
PROJECTS_DF_PATH = r"D:\Papers\Paper 3 - Recommender Systems\Recommender-systems\Files\projects.csv"
# index=False leaves the row index out of the written file.
projects_df.to_csv(PROJECTS_DF_PATH, index=False)

### Analyze tokens

In [1]:
from sentence_transformers import SentenceTransformer, util

In [3]:
# ----------------- embeddings---------------------#
# Name of the sentence-transformers model to load.
EMBEDDING_MODEL_NAME = 'paraphrase-distilroberta-base-v1'
transformer_model = SentenceTransformer(EMBEDDING_MODEL_NAME) # , device= 'cuda'

In [None]:
def _token_count(text):
    """Return the number of input ids the model's tokenizer produces for ``text``."""
    encoded = transformer_model.tokenize([text])
    return len(encoded['input_ids'][0])

# Token count of every project's combined text.
projects_df.loc[:, 'token_length'] = projects_df.loc[:, 'project_txt'].apply(_token_count)