In [1]:
import pysolr
from nltk.tokenize import word_tokenize
import re
import pandas
import requests

try:
    solr = pysolr.Solr('http://localhost:8983/solr/CZ4034')
    solr.ping() #test connection 
except Exception as e:
    if "Failed to connect to server" in str(e):
        print("No solr installed/running")
    elif "HTTP 404" in str(e):
        print("Core CZ4034 doesnt exist")
    else:
        print(e)

In [2]:
def import_csv(csv):
    success = True
    csv.drop_duplicates(subset=['Tweet Id'], keep='last')
    for i in range(len(csv)):
        try:
            hashtags = []
            if type(csv.iloc()[i]['Hashtags']) == "str":
                hashtags = csv.iloc()[i]['Hashtags']
                hashtags = re.sub(r"[\[\]']+", "",hashtags).replace(' ','')
                hashtags = hashtags.split(",")

            temp = {
                'tweet_id' : str(csv.iloc()[i]['Tweet Id']),
                'created_at' : csv.iloc()[i]['Datetime'].replace(' ','T').replace('+00:00','Z'),
                'username' : csv.iloc()[i]['Username'],
                'content' : csv.iloc()[i]['Text'],
                'hashtags' : hashtags,
                'likes' : int(csv.iloc()[i]['Like(s)']),
                'retweet' : int(csv.iloc()[i]['Retweet(s)']),
                'latlon' : str(csv.iloc()[i]['Latitude']) + ',' + str(csv.iloc()[i]['Longitude'])
            }

            solr.add(temp)
        except:
            success = False
            print("Cannot upload tweet", i)
    solr.commit()
    if success:
        return "Success"
    else:
        return "Failed"

In [3]:
def search(search_str, page_no = 1, result_per_page = 10):
    query_string = search_str
    page_no = page_no - 1 #default start is 0
    
    params = {
        'df': 'content',
        'rows': result_per_page,
        'start': page_no,
        'spellcheck': 'true'
    }

    search_results = solr.search(query_string, search_handler='select',**params)
    collations = []
    if search_results.spellcheck.keys():
        collations = list(filter(lambda a: a != 'collation', search_results.spellcheck['collations']))
    
    return search_results.docs, search_results.hits, collations

In [4]:
def delete_all():
    solr.delete(q="*:*")
    solr.commit()
    
def autocomplete_phase(phase):
    #return a list of tuples, tuple: [0] = term, [1] term freq
    #autocomplete = solr.suggest_terms("content", term)
    query_string = phase.rstrip()
    search_results = solr.search(query_string, search_handler='suggest')
    return search_results.raw_response['suggest']['default'][query_string]['suggestions']

def similar_search(text):
    similar = solr.more_like_this('content:'+text, mltfl='content')
    return similar.docs

In [5]:
def location_search(coordinates, search_radius = 5, result_per_page = 10):    
    params = {
        'fq': '{!geofilt}',
        'sfield': 'latlon',
        'pt': coordinates,
        'd': search_radius,
        'sort': 'geodist() asc',
        'rows': result_per_page
        #'start': page_no
    }

    search_results = solr.search("*:*", search_handler='select',**params)
    
    return search_results.docs, search_results.hits

# Below For Testing

In [6]:
tweets = pandas.read_csv (r'crawled_tweets_geo.csv')
tweets = tweets.drop_duplicates(subset=['Tweet Id'], keep='last')
print(tweets.keys())

tweets_tokens = []
for i in range(len(tweets)): 
    check_words = ' '.join(re.sub("([@#][A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",tweets.iloc()[i]['Text']).split())
    text_tokens = word_tokenize(re.sub(r"\S*https?:\S*", "", check_words))
    tweets_tokens = tweets_tokens + text_tokens

print("No of records:",len(tweets))
print("No of words:",len(tweets_tokens))
print("No of unique words:",len(set(tweets_tokens)))

Index(['Datetime', 'Tweet Id', 'Text', 'Username', 'Hashtags', 'Like(s)',
       'Reply(s)', 'Retweet(s)', 'Search term', 'Coordinates', 'Latitude',
       'Longitude'],
      dtype='object')
No of records: 12181
No of words: 301706
No of unique words: 22104


In [7]:
delete_all()
import_csv(tweets)

'Success'

In [9]:
pip install PySimpleGUI

Collecting PySimpleGUI
  Downloading PySimpleGUI-4.57.0-py3-none-any.whl (491 kB)
[K     |████████████████████████████████| 491 kB 4.3 MB/s eta 0:00:01
[?25hInstalling collected packages: PySimpleGUI
Successfully installed PySimpleGUI-4.57.0
Note: you may need to restart the kernel to use updated packages.


In [10]:
import PySimpleGUI as sg
from IPython.display import clear_output

sg.theme("DarkBlue3")
sg.set_options(font=("Courier New", 16))

layout = [
    [sg.Input("")],
    [sg.Button('Cancel')]
]
window = sg.Window('Title', layout, return_keyboard_events=True, use_default_focus=False)
select = ''

while True:
    clear_output(wait=True)
    event, values = window.read()
    if event == sg.WINDOW_CLOSED or event == 'Cancel' :
        break
    elif len(event) == 1 and ord(event) == 13:
        (r1,r2,r3) = search(select)
        print(r1)
        break
    if event is not None:
        # example to use autocomplete
        # phases enclose in <b></b> is what you typed
        # only bold enclose if suggestion starts with typed words
        autocomplete = autocomplete_phase(values[0])
        if len(autocomplete)>0:
            select = autocomplete[0]["term"]
            print("suggestions")
            for au in autocomplete:
                print(au["term"])
        else:
            print("Nothing")
window.close()

suggestions
<b>k</b>yiv
<b>k</b>now
<b>k</b>yiv ukraine
<b>k</b>yiv ukraine https t.co
<b>k</b>yiv ukraine https
photo kyiv ukraine https
photo kyiv ukraine
photo kyiv
a photo kyiv ukraine
a photo kyiv


In [11]:
print(similar_search("Four years ago this morning in Kyiv / Kiev. Happier times. Stay safe my Ukrainian friends. 🇺🇦 @ Kyiv, Ukraine https://t.co/fpeqnl08lm"))

[{'created_at': '2022-03-02T11:45:28Z', 'username': 'na3ar', 'content': 'from #kyiv with love\n\n#uaразом @ Kyiv, Ukraine https://t.co/6uKjNMyyQ4', 'content_suggest': ['from #kyiv with love\n\n#uaразом @ Kyiv, Ukraine https://t.co/6uKjNMyyQ4'], 'likes': 0, 'retweet': 0, 'latlon': '50.45,30.52361111', 'id': 'dd9a3597-717b-4498-a8a9-0ceb387c073f', 'tweet_id': '1498987852192833536', '_version_': 1729162241589116928}, {'created_at': '2022-02-24T10:34:38Z', 'username': 'reigner_7', 'content': 'We are safe here #kyiv #ukraine🇺🇦 @ Kyiv, Ukraine https://t.co/9IJq8nglAD', 'content_suggest': ['We are safe here #kyiv #ukraine🇺🇦 @ Kyiv, Ukraine https://t.co/9IJq8nglAD'], 'likes': 0, 'retweet': 0, 'latlon': '50.45,30.52361111', 'id': 'a4e50ed4-211f-4270-8fbb-1f8d643a2a6a', 'tweet_id': '1496795700566372357', '_version_': 1729162230548660224}, {'created_at': '2022-03-10T10:28:14Z', 'username': 'FreeManReporter', 'content': 'Surveying damage near the #Kyiv TV tower @ Kyiv, Ukraine https://t.co/KJSuPQV

In [421]:
search("kiv russio")

([],
 0,
 ['kyiv russia', 'ki v russia', 'kyiv russi o', 'ki v russi o', 'kiev russia'])

In [427]:
location_search('50.45,30.52361111',5,281)

([{'created_at': '2022-02-24T21:36:08Z',
   'username': 'MilwaukeeAndy',
   'content': 'Just posted a photo @ Kyiv, Ukraine https://t.co/uhGTOj36bC',
   'content_suggest': ['Just posted a photo @ Kyiv, Ukraine https://t.co/uhGTOj36bC'],
   'likes': 0,
   'retweet': 0,
   'latlon': '50.45,30.52361111',
   'id': 'ec5f65b4-08f8-44b1-99da-f693f4cc2823',
   'tweet_id': '1496962172441747461',
   '_version_': 1728691931636039680},
  {'created_at': '2022-02-24T20:50:11Z',
   'username': 'krixmeister',
   'content': 'I don’t have words for the emotions I feel for my friends and colleagues in Ukraine. Please do know that the world is thinking of you, I truly hope the governments of USA and Europe do the right thing, and be safe. https://t.co/tKPh6vl6VE',
   'content_suggest': ['I don’t have words for the emotions I feel for my friends and colleagues in Ukraine. Please do know that the world is thinking of you, I truly hope the governments of USA and Europe do the right thing, and be safe. https: