# Generating a dataset of concepts
- Seeded on [Amazon Books Reviews](https://www.kaggle.com/datasets/mohamedbakhet/amazon-books-reviews) data, filtered as shown below
- We augment Authors and Topics with Wiki crawls
- We generate fake demographic data to enhance users using the [Telco dataset](https://www.kaggle.com/datasets/blastchar/telco-customer-churn)
- We also generate a fake university relationship from a [world university rankings](https://www.kaggle.com/datasets/mylesoneill/world-university-rankings) dataset
- We do some named entity recognition in long texts and do a wiki crawl for random concepts

![Data](GenDataset.png)

In [None]:
# Directory that the generated dataset files in this notebook are written to.
DATA_OUT = '/Users/sirsh/Downloads/monologue_dataset'
# One-off shell command to create that directory (left disabled):
#!mkdir /Users/sirsh/Downloads/monologue_dataset

In [None]:
import pandas as pd
import numpy as np
from stringcase import snakecase
from ast import literal_eval
# Times world university rankings; rows that fail to parse are skipped.
df = pd.read_csv("/Users/sirsh/Downloads/uniranking/timesData.csv", on_bad_lines='skip')
# Cells that are exactly '-' become -1 so the numeric casts below can handle them.
df = df.replace('-', -1)

# Take the best without null values: first 250 rows, drop incomplete ones, keep up to 200.
top_unis = df[:250].dropna()[:200].reset_index(drop=True)

# Cast every score-like column to float in one assign() call.
score_fields = ['teaching', 'international', 'research', 'citations',
                'student_staff_ratio', 'total_score', 'income']
top_unis = top_unis.assign(**{name: top_unis[name].map(float) for name in score_fields})

# Strip the '%' sign and store the percentage as a float in a new column.
top_unis['pct_international_students'] = [
    float(raw.replace('%', '')) for raw in top_unis['international_students']
]
# Strip thousands separators and store the student count as an int.
top_unis['num_students'] = [
    int(float(raw.replace(',', ''))) for raw in top_unis['num_students']
]
for int_field in ('world_rank', 'year'):
    top_unis[int_field] = top_unis[int_field].map(int)

# Persist in both formats, then display the frame in the notebook.
top_unis.to_feather(f"{DATA_OUT}/unis.feather")
top_unis.to_csv(f"{DATA_OUT}/unis.csv")
top_unis


In [None]:
# Show the first university row as a dict to inspect its field names and values.
dict(top_unis.iloc[0])

In [None]:
# Considered generating names from https://www.kaggle.com/datasets/kaggle/us-baby-names but decided not to bother
#pd.read_csv("/Users/sirsh/Downloads/names_us/StateNames.csv")

# Amazon book reviews

In [None]:
# Book metadata fetched from the mentioned dataset; rows that fail to parse are skipped.
df = pd.read_csv("/Users/sirsh/Downloads/amazon/books_data.csv", on_bad_lines='skip')

# Keep only books that have every field used below.
required_fields = ['authors', 'categories', 'publishedDate', 'publisher']
df = df.dropna(subset=required_fields)

# 'authors' and 'categories' hold Python-literal sequences as text; keep the first entry of each.
df['first_author'] = [literal_eval(raw)[0] for raw in df['authors']]
df['category'] = [literal_eval(raw)[0] for raw in df['categories']]

kept_columns = ['Title', 'description', 'first_author', 'category',
                'publishedDate', 'previewLink', 'image']
clean_books = df[kept_columns]
clean_books.head()

In [None]:
# One row per review; rows that fail to parse are skipped.
df = pd.read_csv("/Users/sirsh/Downloads/amazon/Books_rating.csv", on_bad_lines='skip')

# Reviews per user (rows without a user id are left out), ordered by count.
has_user = df['User_id'].notnull()
reviewers_count = df[has_user].groupby('User_id').count().sort_values('Id')
frequent_reviewers = reviewers_count.loc[reviewers_count['Id'] > 5].index.values

# Reviews per title, attached to every review row as 'approx_review_count'.
review_count = (
    df.groupby('Title')
    .count()
    .sort_values('Id')
    .reset_index()[['Title', 'Id']]
    .rename(columns={'Id': "approx_review_count"})
)
df = df.merge(review_count, on='Title')

# A review survives only if all three conditions hold:
#  - its book has been reviewed often (more than 1000 reviews)
#  - its book is in our list of cleaner books
#  - its reviewer has reviewed more than 5 times, as calculated above
keep_row = (
    (df['approx_review_count'] > 1000)
    & df['Title'].isin(clean_books['Title'])
    & df['User_id'].isin(frequent_reviewers)
)
selected_columns = ['Id', 'Title', 'review/score', 'review/time', 'review/text',
                    'profileName', 'User_id']
df = df[keep_row][selected_columns].reset_index(drop=True)

# Normalise the column names to lower snake_case ('/' becomes '_').
df = df.rename(columns=lambda c: snakecase(c).replace('/','_').replace('__','_').lower())
df.to_feather(f"{DATA_OUT}/amz_book_reviews_curated.feather")
df

In [None]:
# Keep only the books that the curated reviews actually reference.
referenced_titles = df['title'].unique()
is_referenced = clean_books['Title'].isin(referenced_titles)
clean_books = clean_books[is_referenced].reset_index(drop=True)

# Normalise the column names to lower snake_case.
clean_books = clean_books.rename(columns=lambda c: snakecase(c).replace('/','_').lower())

# Be careful of mixed types: force every published date to a string before writing.
clean_books['published_date'] = [str(value) for value in clean_books['published_date']]
clean_books.to_feather(f"{DATA_OUT}/amz_books_curated.feather")
clean_books

# Generating Fake Users
- We give the users mentioned in the reviews some color:
- a university
- a favourite X from some known lists

In [None]:
# Attach each book's category to its reviews by joining on the shared 'title' column.
book_categories = clean_books[['title', 'category']]
categorized_book_reviews = df.merge(book_categories, on='title')

In [None]:
def _most_frequent(values):
    """Return the value that occurs most often in one group's Series."""
    return values.value_counts().idxmax()


reviews_by_user = categorized_book_reviews.groupby('user_id')

# Per-user review statistics.
# The reducers are named by string ('min', 'max', 'mean') instead of passing the
# builtins / np.mean: pandas deprecated passing those callables to agg and will
# eventually call them directly rather than dispatching to its own NaN-skipping
# group reductions. Named aggregation also produces the final flat column names
# directly, so nothing depends on the positional order of a MultiIndex.
user_status = reviews_by_user.agg(
    review_count=('id', len),
    review_min=('review_score', 'min'),
    review_max=('review_score', 'max'),
    review_average=('review_score', 'mean'),
    profile_name=('profile_name', 'min'),
)

# Most frequently reviewed title and category per user.
fav_books = reviews_by_user['title'].apply(_most_frequent)
fav_category = reviews_by_user['category'].apply(_most_frequent)

users = user_status.join(fav_books).join(fav_category).reset_index().rename(columns={
    "title": 'favourite_book',
    "category": 'favourite_topic'
})

# Fake attribute: each user gets a university drawn at random (with replacement)
# from the rankings table. The draw is unseeded, so it differs between runs.
users['university_attended'] = top_unis['university_name'].sample(len(users), replace=True).values
users.to_feather(f"{DATA_OUT}/amz_books_reviewer_users_generated.feather")
users


# Generating Concepts
- When generating concepts we decide on specificity
- For example we can define People or Authors. Authors can be just people with a "type" 
- We add relations, e.g. related-concept links, which are good for traversal
- The core entity can be added to redis (without the text) or to vector stores

In [None]:
# List the distinct favourite topics that were assigned to the generated users.
users['favourite_topic'].unique()

In [None]:
from glob import glob

# List every file with an extension that has been written to the output directory.
glob(f"{DATA_OUT}/*.*")

# How monologue ingests data with the correct schema

In [None]:
import sys
import pandas as pd
# Extend the import path three directories up from the working directory,
# presumably so the local `monologue` package can be imported - confirm against the repo layout.
sys.path.append("../../../")

In [None]:
from monologue.core.data.stores import VectorDataStore, ColumnarDataStore
from monologue.entities.examples import TopUniversities, Books, BookReviewers, AbstractVectorStoreEntry

### first the columnar store version - simply writes to parquet under the hood

In [None]:
# Ingest / merge data: load the curated books feather file as `Books` entities.
# NOTE(review): mode='overwrite' presumably replaces any previously ingested
# Books data rather than merging - confirm against ColumnarDataStore.
ColumnarDataStore.ingest_records('/Users/sirsh/Downloads/monologue_dataset/amz_books_curated.feather', 
                                 entity_type=Books, 
                                 mode='overwrite')

In [None]:
#ask the tool questions
books_store = ColumnarDataStore(Books)
#books_store("List any books by Tolken?")

In [None]:
# Ingest the university rankings feather file as `TopUniversities` entities.
# This is fast because it uses parquet only; no mode is passed, so the store's default applies.
ColumnarDataStore.ingest_records('/Users/sirsh/Downloads/monologue_dataset/unis.feather', entity_type=TopUniversities)

In [None]:
# Build a store over the TopUniversities entity type.
store = ColumnarDataStore(TopUniversities)
# Example natural-language query (left disabled):
#store("What university has the top citations")

## Now the vector store type
- First we show how a generic wikipedia entry can be ingested into a "dynamic type"
- Here we make use of Pydantic to create a type that inherits from another type
- The rest works the same; any known type can be ingested in this way - we have some pre-baked vector data types in the examples
- The other thing to know is that the wikipedia data maps into the abstract vector type since it has name and text
- The vector store type defaults some other things that can be set, like id and doc_ids
- The vector embeddings are computed on add (using LanceDB and whatever embeddings are specified in the pydantic type)

In [None]:
# Use the WikiWalker to get some sample data.
from monologue.core.data.clients import WikiWalker
from monologue.entities.examples import AbstractVectorStoreEntry
# Only the first section yielded for "Philosophy" is needed: the loop breaks
# immediately, leaving that item bound to `record` for the cells below.
for record in WikiWalker().iter_sections("Philosophy"):
    break

In [None]:
# This creates a vector store entry type that will be under GeneralTopics.
# By inheriting from the abstract entity it is set up to write vector data and embeddings.
# This is a very thin pydantic descriptor plus LanceDB. Check it out under the hood.
generic_topic = AbstractVectorStoreEntry.create_model("GeneralTopics")
# Instantiate the dynamic type from the sample record; the notebook displays the result.
generic_topic(**record)

In [None]:
# Vector store bound to the dynamic GeneralTopics type.
store = VectorDataStore(generic_topic)

# Turn every "Philosophy" section into an entry of that type and add them all in one call.
philosophy_sections = WikiWalker().iter_sections("Philosophy")
collection = [generic_topic(**section) for section in philosophy_sections]
store.add(collection)
store

In [None]:
# Ask the vector store a natural-language question about the ingested sections.
store("Where did the word Philosophy come from?")