# Medium Article Semantic Search by Title+Subtitle

### Load Data

In [None]:
import pandas as pd

In [None]:
# Make Google Drive visible to this Colab runtime; the dataset CSV is read from it below.
from google.colab import drive

MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Data source: https://www.kaggle.com/datasets/nulldata/medium-post-titles
# Only the first N_ROWS rows are read here; loading the whole data set is left as the exercise.
CSV_PATH = "/content/drive/MyDrive/Colab Notebooks/Datasets/medium_post_titles.csv"
N_ROWS = 10000
df = pd.read_csv(CSV_PATH, nrows=N_ROWS)


In [None]:
# Peek at the first five rows to see which columns are available.
df.head(n=5)

Unnamed: 0,category,title,subtitle,subtitle_truncated_flag
0,work,"""21 Conversations"" - A fun (and easy) game for...",A (new?) Icebreaker game to get your team to s...,False
1,spirituality,"""Biblical Porn"" at Mars Hill",Author and UW lecturer Jessica Johnson talks a...,False
2,lgbtqia,"""CISGENDER?! Is That A Disease?!""","Or, a primer in gender vocabulary for the curi...",False
3,equality,"""Call me Nat Love"" :Black Cowboys and the Fron...",,False
4,artificial-intelligence,"""Can I Train my Model on Your Computer?""",How we waste computational resources and how t...,False


In [None]:
# How many subtitles are flagged as truncated vs. complete.
flag_counts = df["subtitle_truncated_flag"].value_counts()
flag_counts


Unnamed: 0_level_0,count
subtitle_truncated_flag,Unnamed: 1_level_1
False,6318
True,3682


### Data Cleanup

In [None]:
# Number of missing values in each column.
df.isnull().sum()

Unnamed: 0,0
category,0
title,0
subtitle,0
subtitle_truncated_flag,0
title_extended,0


In [None]:
# Discard every row that has at least one missing value (e.g. a post without a subtitle).
df = df.dropna(axis=0, how='any')

In [None]:
# Keep only the rows whose subtitle is not flagged as truncated.
not_truncated = ~df["subtitle_truncated_flag"]
df = df[not_truncated]

In [None]:
# Sanity check: after the filter only the False flag should remain.
flag_counts_clean = df["subtitle_truncated_flag"].value_counts()
flag_counts_clean

Unnamed: 0_level_0,count
subtitle_truncated_flag,Unnamed: 1_level_1
False,6211


In [None]:
# (rows, columns) remaining after cleanup.
n_rows, n_cols = df.shape
(n_rows, n_cols)

(6211, 5)

In [None]:
# Inspect the title column on its own.
df.loc[:, 'title']

Unnamed: 0,title
0,"""21 Conversations"" - A fun (and easy) game for..."
1,"""Biblical Porn"" at Mars Hill"
2,"""CISGENDER?! Is That A Disease?!"""
4,"""Can I Train my Model on Your Computer?"""
5,"""Cypherpunks and Wall Street"": The Security To..."
...,...
9994,America Lets Too Much Young Talent Go to Waste
9996,America Loves the Idea of Family Farms. That’s...
9997,America May Need to Adopt China’s Weapons to W...
9998,America May Outsmart China in 5G With AI and B...


In [None]:


# Text that gets embedded: title followed by subtitle.
# A single space is inserted between the two; plain concatenation glued the last word of
# the title to the first word of the subtitle (e.g. "...each otherA (new?) Icebreaker...").
df['title_extended'] = df['title'] + ' ' + df['subtitle']

In [None]:
# Inspect the combined title + subtitle text.
df.loc[:, 'title_extended']

Unnamed: 0,title_extended
0,"""21 Conversations"" - A fun (and easy) game for..."
1,"""Biblical Porn"" at Mars HillAuthor and UW lect..."
2,"""CISGENDER?! Is That A Disease?!""Or, a primer ..."
4,"""Can I Train my Model on Your Computer?""How we..."
5,"""Cypherpunks and Wall Street"": The Security To..."
...,...
9994,America Lets Too Much Young Talent Go to Waste...
9996,America Loves the Idea of Family Farms. That’s...
9997,America May Need to Adopt China’s Weapons to W...
9998,America May Outsmart China in 5G With AI and B...


In [None]:
# Full combined text of the row labelled 0 (label-based lookup, not positional).
df.loc[0, 'title_extended']

'"21 Conversations" - A fun (and easy) game for teams to get to know each otherA (new?) Icebreaker game to get your team to say all the interesting stuff'

In [None]:
# Scratch checks kept for reference; everything is commented out, so running this cell does nothing.
# df.head()
# df['category'].nunique()  # metadata
# df.shape # 6k vectors, full set in exercise

### Prep for Upsert

In [None]:
# init pinecone
#
# Credentials are taken from the PINECONE_API_KEY / PINECONE_ENV environment variables and,
# when those are not set, asked for interactively. This keeps secrets out of the notebook
# and lets the cell run on a fresh kernel (API_KEY / ENV were previously never assigned).
import os
from getpass import getpass

import pinecone
from tqdm.autonotebook import tqdm # warning taken care of

API_KEY = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
ENV = os.environ.get("PINECONE_ENV") or input("Pinecone environment: ")

pinecone.init(api_key=API_KEY, environment=ENV)


  from tqdm.autonotebook import tqdm


In [None]:
# Create the index only when it does not exist yet, so that re-running the notebook
# does not fail on an already-existing index.
# NOTE(review): dimension=384 is expected to equal the embedding size of the
# all-MiniLM-L6-v2 model loaded below — confirm if the model is ever changed.
INDEX_NAME = 'medium-data'
if INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(name=INDEX_NAME, dimension=384, pod_type='s1', metric="cosine")

In [None]:
from sentence_transformers import SentenceTransformer
import torch

In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the notebook
# also runs on a runtime without CUDA instead of failing at model load.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

In [None]:
# Embed all texts with one batched encode() call instead of one model call per row.
texts = df['title_extended'].tolist()
embeddings = model.encode(texts)

# Store each embedding as a plain Python list (one list per row), keeping the frame's index
# so every vector stays attached to the row it was computed from.
df['values'] = pd.Series(embeddings.tolist(), index=df.index)

In [None]:
# Consecutive ids 0..n-1 in row order. The frame's own index has gaps after the cleanup
# filters, so it is not used as the id.
df['id'] = list(range(len(df)))

In [None]:
# Per-row metadata payload: one dict holding the title, subtitle and category of the post.
metadata_columns = ['title', 'subtitle', 'category']
df['metadata'] = df[metadata_columns].to_dict(orient='records')

In [None]:
# Columns sent to Pinecone. An explicit copy is taken so that later column assignments on
# df_upsert modify an independent frame and do not raise SettingWithCopyWarning.
df_upsert = df[['id', 'values', 'metadata']].copy()

In [None]:
# Cast the ids to strings. assign() returns a new frame, so nothing is written into a
# slice of df and no SettingWithCopyWarning is raised.
df_upsert = df_upsert.assign(id=df_upsert['id'].astype(str))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_upsert['id'] = df_upsert['id'].map(lambda x: str(x))


In [None]:
# Handle to the 'medium-data' index; used below for upserting and querying.
index = pinecone.Index('medium-data')

In [None]:
# Send every row of df_upsert to the index (about a minute for ~6k rows).
upsert_response = index.upsert_from_dataframe(df_upsert)
upsert_response

sending upsert requests:   0%|          | 0/6211 [00:00<?, ?it/s]

{'upserted_count': 6211}

### Query

In [None]:
# Embed the question with the same model and fetch the 10 nearest vectors with their metadata.
query_text = "which city is the most beautiful"
query_vector = model.encode(query_text).tolist()  # python list
xc = index.query(query_vector, top_k=10, include_metadata=True)

In [None]:
# One line per match: rounded similarity score, title and category.
for match in xc['matches']:
    score = round(match['score'], 2)
    meta = match['metadata']
    print(f"{score}: {meta['title']}: {meta['category']} ")

0.57: 3 Places Where You Can Find Beauty: photography 
0.46: 6 Easy Reasons to Enjoy Exploring South Wales: travel 
0.45: A City That’s Better for the Blind Is Better for Everyone: accessibility 
0.45: A Shining City on a Hill: politics 
0.42: A Most Beautiful Game: sports 
0.4: 6 Literary Cities for Book Lovers To Visit This Year: travel 
0.4: Ace Hotel: A UX Case Study: ux 
0.39: A city and its architecture: cities 
0.39: Adaptive urban design: design 
0.38: Aesthetics of Being: spirituality 


In [None]:
# Same matches again, this time showing the subtitle instead of the title.
for match in xc['matches']:
    score = round(match['score'], 2)
    meta = match['metadata']
    print(f"{score}: {meta['subtitle']}: {meta['category']} ")

0.57: If you are willing to look hard enough, eventually you will see beauty in the most difficult of places.: photography 
0.46: Pembrokeshire is as beautiful as the Italian Coast.: travel 
0.45: Complete parity with the sighted may seem like an impossible goal, but maybe the only thing holding us back is a lack of imagination.: accessibility 
0.45: What does America stand for?: politics 
0.42: The World Cup gets advertising right: sports 
0.4: Combine your love for books and travel with these 6 literary cities.: travel 
0.4: Discover the city you are visting like a local: ux 
0.39: Bangalore Chapter: cities 
0.39: Choatic nature of order: design 
0.38: Examining life through a lens of beauty: spirituality 


### Exercise: Upsert all data