# Notebook 2: Indexing
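This notebook cleans the tweet and author-bio text and builds the Terrier indices (one tweet index and two author-bio indices, with and without positions) that are used later for retrieval with PyTerrier.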

In [None]:
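# Run once in a fresh environment (e.g. Colab) before importing pyterrier;
# 0.7.1 is the PyTerrier version this notebook was last executed with:
# %pip install python-terrier==0.7.1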

In [3]:
import pandas as pd
import re
# Allow up to 150 characters per cell when a DataFrame is displayed, so long
# text columns are truncated less aggressively than with the pandas default.
pd.set_option('display.max_colwidth', 150)
import pyterrier as pt
# NOTE(review): numpy is not referenced in any cell shown in this notebook.
import numpy as np
# Guard the initialisation so pt.init() is not called a second time when this
# cell is re-run in the same kernel.
if not pt.started():
    pt.init()

PyTerrier 0.7.1 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)


No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [4]:
# Locations of the two input CSVs: the tweet table and the author table.
# NOTE(review): `data_path` is not defined in any cell shown in this notebook
# (the first executed cell is In [3]); it was presumably set in a cell that is
# no longer present. Define it in a configuration cell above this one so the
# notebook survives Restart Kernel -> Run All.
# The plain '+' concatenation assumes `data_path` is a str that is empty or
# ends with a path separator -- TODO confirm.
TWEET_DF_PATH = data_path + './all_tweets_updated.csv'
AUTHOR_DF_PATH = data_path + './all_authors_updated.csv'

In [5]:
def clean_text(text):
    """Reduce ``text`` to ASCII letters and digits separated by single spaces.

    Every run of characters outside ``[a-zA-Z0-9]`` (punctuation, any kind of
    whitespace, non-ASCII letters, emoji, ...) is replaced by exactly one
    space. Case is preserved and the result is NOT stripped, so a leading or
    trailing run of such characters leaves one leading or trailing space.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str or None
        The cleaned text, or ``None`` when ``text`` is not a ``str`` (e.g.
        ``None``, bytes, or a float NaN standing in for a missing value).
    """
    # Explicit type check instead of a bare ``except:``: non-string input still
    # yields None, but KeyboardInterrupt and genuine programming errors are no
    # longer silently swallowed.
    if not isinstance(text, str):
        return None
    # One pass is equivalent to "replace each non-alphanumeric character with a
    # space, then collapse repeated spaces": after the first step the only
    # whitespace left is the spaces it produced. The raw string also avoids the
    # invalid "\s" escape of a plain string literal.
    return re.sub(r'[^a-zA-Z0-9]+', ' ', text)

In [6]:
# Load both CSVs. Each is expected to carry an 'Unnamed: 0' column, which is
# dropped. For tweets, the fresh positional index is kept as a column and
# renamed to 'docno'; for authors the index is simply reset.
tweet_df = (
    pd.read_csv(TWEET_DF_PATH, lineterminator='\n')
    .drop(columns='Unnamed: 0')
    .reset_index(drop=False)
    .rename(columns={'index': 'docno'})
)
author_df = (
    pd.read_csv(AUTHOR_DF_PATH, lineterminator='\n')
    .drop(columns='Unnamed: 0')
    .reset_index(drop=True)
)

# Add the cleaned text columns; the tweet docno is converted to a string at
# the same time (it stays in its original column position).
tweet_df = tweet_df.assign(
    text_clean=tweet_df['text'].apply(clean_text),
    docno=tweet_df['docno'].apply(str),
)
author_df = author_df.assign(
    bio_cleaned=author_df['author_bio'].apply(clean_text),
)

# Two-column frames (docno, text) handed to the indexer cells below:
# tweets are keyed by their row number, authors by their author_id.
tweet_df_index = (
    tweet_df[['docno', 'text_clean']]
    .rename(columns={'text_clean': 'text'})
)
author_df_index = (
    author_df[['author_id', 'bio_cleaned']]
    .rename(columns={'author_id': 'docno', 'bio_cleaned': 'text'})
    .astype({'docno': str})
)

In [7]:
#TWEET INDEXING
# Builds a Terrier index over the cleaned tweet text, using the stringified
# row number as docno. No `blocks` argument is passed; the statistics printed
# by the last run report "Positions: false".
# NOTE(review): the target is a hard-coded absolute path (looks like a
# Colab-mounted shared drive) -- consider deriving it from a configurable base
# directory.
index_dir = '/content/drive/Shareddrives/actual_650_project/indices/tweet_index_final'
# overwrite=True presumably replaces any index already present in index_dir,
# which is what lets this cell be re-run -- confirm against the PyTerrier docs.
indexer = pt.DFIndexer(index_dir, overwrite=True)
# The last run logged a Terrier warning about empty documents; rows whose
# cleaned text is None or contains no tokens are the likely cause -- verify.
index_ref = indexer.index(tweet_df_index["text"], tweet_df_index["docno"])
# Resolve the returned reference into an index object so it can be inspected.
index = pt.IndexFactory.of(index_ref)
# Sanity check: print document / term / posting / token counts for the new index.
print(index.getCollectionStatistics().toString())

04:39:17.099 [main] WARN org.terrier.structures.indexing.Indexer - Indexed 477 empty documents
Number of documents: 163946
Number of terms: 100946
Number of postings: 1470196
Number of fields: 0
Number of tokens: 1553355
Field names: []
Positions:   false



In [8]:
#BIO INDEXING (Positional)
# Builds a Terrier index over the cleaned author bios, keyed by author_id.
# blocks=True is the only difference from the non-positional bio index; the
# statistics printed by the last run report "Positions: true" for this index.
# NOTE(review): the target is a hard-coded absolute path (looks like a
# Colab-mounted shared drive) -- consider deriving it from a configurable base
# directory.
# NOTE(review): index_dir / indexer / index_ref / index are reused by every
# indexing cell, so after this cell they refer to the positional bio index.
index_dir = '/content/drive/Shareddrives/actual_650_project/indices/bio_index_positions_final'
# overwrite=True presumably replaces any index already present in index_dir
# -- confirm against the PyTerrier docs.
indexer = pt.DFIndexer(index_dir, overwrite=True, blocks=True)
# The last run logged a Terrier warning about empty documents; authors whose
# cleaned bio is None or contains no tokens are the likely cause -- verify.
index_ref = indexer.index(author_df_index["text"], author_df_index["docno"])
# Resolve the returned reference into an index object so it can be inspected.
index = pt.IndexFactory.of(index_ref)
# Sanity check: print document / term / posting / token counts for the new index.
print(index.getCollectionStatistics().toString())

04:39:27.843 [main] WARN org.terrier.structures.indexing.Indexer - Indexed 1167 empty documents
Number of documents: 9411
Number of terms: 18721
Number of postings: 67194
Number of fields: 0
Number of tokens: 69443
Field names: []
Positions:   true



In [9]:
#BIO INDEXING (Non-Positional)
# Builds a second Terrier index over the same cleaned author bios, keyed by
# author_id, this time with blocks=False; the statistics printed by the last
# run report "Positions: false" and otherwise the same counts as the
# positional bio index.
# NOTE(review): the target is a hard-coded absolute path (looks like a
# Colab-mounted shared drive) -- consider deriving it from a configurable base
# directory.
# NOTE(review): index_dir / indexer / index_ref / index are reused by every
# indexing cell, so after this cell they refer to the non-positional bio index.
index_dir = '/content/drive/Shareddrives/actual_650_project/indices/bio_index_nopositions_final'
# overwrite=True presumably replaces any index already present in index_dir
# -- confirm against the PyTerrier docs.
indexer = pt.DFIndexer(index_dir, overwrite=True, blocks=False)
# The last run logged a Terrier warning about empty documents; authors whose
# cleaned bio is None or contains no tokens are the likely cause -- verify.
index_ref = indexer.index(author_df_index["text"], author_df_index["docno"])
# Resolve the returned reference into an index object so it can be inspected.
index = pt.IndexFactory.of(index_ref)
# Sanity check: print document / term / posting / token counts for the new index.
print(index.getCollectionStatistics().toString())

04:39:36.173 [main] WARN org.terrier.structures.indexing.Indexer - Indexed 1167 empty documents
Number of documents: 9411
Number of terms: 18721
Number of postings: 67194
Number of fields: 0
Number of tokens: 69443
Field names: []
Positions:   false

