In [1]:
import requests

def send_solr_request(q):
    url = "http://localhost:8983/solr/animals/query"
    
    formatted_query = f"'{q}'"
    # Define the query parameters
    params = {
        'q': q,
        "defType": "edismax",
        "qf": "Name^2.5 Features^2.0 Fun_Fact^2.0 Diet^2.0 Text^1.5 Features^2.0 Behavior^2.0",
        "pf": "Name^2.5 Features^2.0 Fun_Fact^2.0 Diet^2.0 Text^1.5 Features^2.0 Behavior^2.0",
        "mm": "3<-25%",
        "ps": 5,
        'rq': f'{{!ltr model=animals_model efi.text={formatted_query}}}',
        'fl': 'id,Name,score,[features]',
        "rows": "30"

    }

    try:
        # Send the HTTP GET request
        response = requests.get(url, params=params)

        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Print the response content
            # print("Response:")
            return (response.json())
        else:
            print(f"Error: {response.status_code} - {response.text}")

    except requests.RequestException as e:
        print(f"Error: {e}")

def parse_solr_response(response_json):
    # Extract the list of animals from the Solr response
    animals = []

    # Check if 'response' and 'docs' keys exist in the response
    if 'response' in response_json and 'docs' in response_json['response']:
        for doc in response_json['response']['docs']:
            # Check if 'Name' and 'score' keys exist in the document
            if 'Name' in doc and 'score' in doc:
                animal = {
                    'Name': doc['Name'],
                    '[features]': doc['[features]']
                }
                animals.append(animal)

    return animals

res = send_solr_request('natural habitat')

print (parse_solr_response(res))



[{'Name': 'Kiko Goat', '[features]': 'queryMatchName=0.0,queryMatchGenus=0.0,queryMatchClass=0.0,queryMatchOrigin=0.0,queryMatchFun_Fact=0.0,queryMatchMigratory=0.0,queryMatchText=1.04024,originalScore=249.89532'}, {'Name': 'Asiatic Black Bear', '[features]': 'queryMatchName=0.0,queryMatchGenus=0.0,queryMatchClass=0.0,queryMatchOrigin=0.0,queryMatchFun_Fact=0.0,queryMatchMigratory=0.0,queryMatchText=1.1510327,originalScore=3.2045345'}, {'Name': 'Pea Puffer', '[features]': 'queryMatchName=0.0,queryMatchGenus=0.0,queryMatchClass=0.0,queryMatchOrigin=0.0,queryMatchFun_Fact=0.0,queryMatchMigratory=0.0,queryMatchText=1.0891392,originalScore=3.2302675'}, {'Name': 'Bobcat', '[features]': 'queryMatchName=0.0,queryMatchGenus=0.0,queryMatchClass=0.0,queryMatchOrigin=0.0,queryMatchFun_Fact=0.0,queryMatchMigratory=0.0,queryMatchText=1.132777,originalScore=3.140967'}, {'Name': 'Eastern Woodrat', '[features]': 'queryMatchName=0.0,queryMatchGenus=0.0,queryMatchClass=0.0,queryMatchOrigin=0.0,queryMatc

In [462]:
import pandas as pd
import json as simplejson

queries = ["Energetic dog breeds suited for hunting",
"North America animals that like to eat insects",
"Change the color of their skin, fur or feathers for the purpose of camouflage",
"Animals that walk in hierarchical groups or herds and how they deal with territory", 
"(NOT Birds) migrate to Mexico or migrate to America"]

for idx, q in enumerate(queries):
    res = send_solr_request(q)
    animals = parse_solr_response(res)

    df = pd.DataFrame(animals)
    # df.to_csv('queries/query{0}_results.csv'.format(idx+1), index=False)

    # drop every column except the Name 
    # df = df[df.columns[0:1]]

    df['score'] = 0.0

    # save the query results to a csv file
    # df.to_csv('queries/query{0}_score.csv'.format(idx+1), index=False)


### Criteria
- 0 - A document that does not match the query.
- 1 - A document that vaguely matches the query, is very incomplete (missing important fields, like instructions) and has no reviews. Or has very negative reviews.
- 2 - A document that partially matches the query, is incomplete. 
- 3 - A document that matches the query semantically, is reasonably complete (may miss more than two fields) and has at least one positive review.
- 4 - A document that perfectly or almost perfectly matches the query semantically, is complete or missing just one of the fields.
- 5 - A document that perfectly matches the query semantically, is complete (the recipe has a full ingredient list, steps and cook time/nutritional information).




In [463]:
## linear model using svm


import pandas as pd
import glob

result_files = glob.glob("queries/*_results_done.csv")
scores_files = glob.glob("queries/*_score_done.csv")

result_files.sort()
scores_files.sort()

inputs = pd.concat((pd.read_csv(file) for file in result_files), ignore_index=True)
scores = pd.concat((pd.read_csv(file) for file in scores_files), ignore_index=True)


#merge results_files and scores_files into a single data frame called ltr
for ind, pair in enumerate(zip(result_files, scores_files)):
    result = pd.read_csv(pair[0])
    score = pd.read_csv(pair[1])
    result['score'] = score['score']
    result['Name'] = score['Name']
    result.to_csv('global/query{0}_ltr.csv'.format(ind+1), index=False)

X = []
Y = [entry.score for entry in scores.itertuples()]


In [464]:
def get_features(entry):
    return [float(feature.split("=")[1]) for feature in entry._2.split(",")]


for entry in inputs.itertuples():

    X.append(get_features(entry))

print(X)

[[0.0, 0.0, 0.0, 0.0, 3.622953, 0.0, 5.7944894, 8.691734], [2.7420187, 0.0, 0.0, 0.0, 6.804531, 0.0, 5.6559587, 8.483938], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.592811, 8.389216], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.544828, 8.317242], [0.0, 0.0, 0.0, 0.0, 4.1125107, 0.0, 5.5396934, 8.30954], [0.0, 0.0, 0.0, 0.0, 1.9825163, 0.0, 5.4566836, 8.185026], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.408512, 8.112768], [0.0, 0.0, 0.0, 0.0, 1.8832673, 0.0, 5.3943496, 8.091524], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.3816624, 8.072494], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.277225, 7.915838], [0.0, 0.0, 0.0, 0.0, 2.4496305, 0.0, 5.2685533, 7.90283], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.235448, 7.8531713], [0.0, 0.0, 0.0, 0.0, 4.0866013, 0.0, 5.188687, 7.78303], [3.4102473, 0.0, 0.0, 0.0, 0.0, 0.0, 5.177629, 7.7664433], [0.0, 0.0, 0.0, 0.0, 10.021632, 0.0, 5.1717935, 7.75769], [0.0, 0.0, 0.0, 0.0, 2.159968, 0.0, 5.1588583, 7.738287], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.156592, 7.7348876], [3.4102473, 0.0, 0.0, 0.0, 1.7868949, 0.0, 5

In [465]:


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm, linear_model
from sklearn.metrics import r2_score

scaler = StandardScaler()

scaler.fit(X)
X = scaler.fit_transform(X)

(train_x,
 test_x,
 train_y,
 test_y) = train_test_split(X, Y, test_size=0.5)

 
# scaler.fit(X)
# X = scaler.fit_transform(X)
# best_random_state = None
# best_r2_score = -1

# for random_state in range(0, 100):
#     # Split the data using the current random state
#     train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.20, random_state=random_state)

#     # Perform any necessary preprocessing steps
#     scaler = StandardScaler()
#     scaler.fit(train_x)
#     train_x = scaler.transform(train_x)
#     test_x = scaler.transform(test_x)

#     # Train your model
#     model =  svm.LinearSVR()  # Replace YourModel with the actual model you are using
#     model.fit(train_x, train_y)

#     # Make predictions on the test set
#     predictions = model.predict(test_x)

#     # Evaluate the model using a suitable metric (e.g., R-squared score)
#     r2 = r2_score(test_y, predictions)

#     # Update the best random state if necessary
#     if r2 > best_r2_score:
#         best_r2_score = r2
#         best_random_state = random_state

# print("Best Random State:", best_random_state)
# print("Best R-squared Score:", best_r2_score)



In [466]:
from sklearn import svm, linear_model
from sklearn.metrics import r2_score

linearSVM = svm.LinearSVR()
lienarReg = linear_model.LinearRegression()

linearSVM.fit(train_x, train_y)
lienarReg.fit(train_x, train_y)

pred_svm = linearSVM.predict(test_x)
pred_reg = lienarReg.predict(test_x)

r2_score(test_y, pred_svm)

-0.1008676046736896

In [467]:
# fit with all data
linearSVM = svm.LinearSVR()
lienarReg = linear_model.LinearRegression()

linearSVM.fit(X, Y)
lienarReg.fit(X, Y)


LinearRegression()

In [468]:
## print the scores of the features in the model
print(linearSVM.coef_)

[-0.41950684  0.          0.         -0.42148981  0.16951237  0.43675539
  0.08351342  0.0835134 ]


In [469]:
import numpy as np

np.set_printoptions(suppress=True)

print(lienarReg.coef_)
print("Linear Regression R-squared Score:", r2_score(test_y, pred_reg))


[     -0.21549788      -0.00000737      -0.00001332      -0.35084062
       0.08501219       0.420853    319814.86613979 -319814.73861028]
Linear Regression R-squared Score: -0.04896168982116622


In [470]:


def send_ltr_request(q):
    url = "http://localhost:8983/solr/animals/query"
    
    # Format the query
    formatted_query = f"'{q}'"

    # Define the query parameters
    params = {
        'q': q,
        'df': 'Text',
        'rq': f'{{!ltr model=animals_model efi.text={formatted_query}}}',
        'fl': 'id,Name,score,[features]',
        "rows": "30"
    }

    try:
        # Send the HTTP GET request
        response = requests.get(url, params=params)

        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Print the response content
            # print("Response:")
            return (response.json())
        else:
            print(f"Error: {response.status_code} - {response.text}")

    except requests.RequestException as e:
        print(f"Error: {e}")

In [471]:
queries[4] = "-Class:Aves migrate to Mexico or migrate to America"
send_ltr_request_ = send_ltr_request("-Class:Aves migrate to Mexico or migrate to America")
parsed_response = parse_solr_response(send_ltr_request_)

for i in parsed_response:
    print(i['Name'])

Freshwater Eel
Black Footed Ferret
California Kingsnake
Rodents
Deer Mouse
Orinoco Crocodile
Mule Deer
Kinkajou
Black Bass
White Marlin
Bighorn Sheep
Monarch Butterfly
Spanish Mackerel
Atlantic Salmon
Humboldt Squid
Newt
Goliath Grouper
Salmon Shark
Needlefish
Yellowtail Snapper
White Crappie
Camel Spider
Sockeye Salmon
Eyelash Viper
Hogfish
Steelhead Salmon
Rainbow Grasshopper Dactylotum Bicolor
Mangrove Snapper
Leopard Lizard
Oleander Hawk Moth


In [472]:

## calculate to a csv file the precision at n, Recall at n, f-measures at n

def calculate_precision_at_n(relevant_query, parsed_response, n):
    relevant = 0
    for i in range(n):
        if parsed_response[i]['Name'] in relevant_query:
            relevant += 1
    return relevant/n

def calculate_recall_at_n(relevant_query, parsed_response, n):
    relevant = 0
    for i in range(n):
        if parsed_response[i]['Name'] in relevant_query:
            relevant += 1
    return relevant/len(relevant_query)

def calculate_f_measure_at_n(relevant_query, parsed_response, n):
    precision = calculate_precision_at_n(relevant_query, parsed_response, n)
    recall = calculate_recall_at_n(relevant_query, parsed_response, n)
    if (precision+recall) == 0:
        return 0
    return (2*precision*recall)/(precision+recall)

for idx, q in enumerate(queries):
    print(q)
    res = send_ltr_request(q)
    parsed_response = parse_solr_response(res)
    relevant_query = []
    with open(f"metrics/metrics_relevant_q{idx+1}.txt", 'r') as file:
        relevant_query = [line.strip() for line in file]

    ## calculate precision at n(30), recall and f-measure from 1 to 30 and save to a csv file
    precision = []
    recall = []
    f_measure = []
    N = []
    for i in range(1, 31):
        N.append(i)
        precision.append(calculate_precision_at_n(relevant_query, parsed_response, i))
        recall.append(calculate_recall_at_n(relevant_query, parsed_response, i))
        f_measure.append(calculate_f_measure_at_n(relevant_query, parsed_response, i))
    
    df = pd.DataFrame({'N': N,'Precision': precision, 'Recall': recall, 'F-Measure': f_measure})
    df.to_csv('metrics/query{0}_metrics.csv'.format(idx+1), index=False)


Energetic dog breeds suited for hunting
North America animals that like to eat insects
Change the color of their skin, fur or feathers for the purpose of camouflage


Animals that walk in hierarchical groups or herds and how they deal with territory
-Class:Aves migrate to Mexico or migrate to America


In [4]:
import pandas as pd
import json as simplejson

queries = ["Energetic dog breeds suited for hunting",
"North America animals that like to eat insects",
"Change the color of their skin, fur or feathers for the purpose of camouflage",
"Animals that walk in hierarchical groups or herds and how they deal with territory", 
"-Class:Aves migrate to Mexico or migrate to America"]

for idx, q in enumerate(queries):
    res = send_solr_request(q)
    animals = parse_solr_response(res)

    df = pd.DataFrame(animals)
    df.to_csv('global/query{0}_results.csv'.format(idx+1), index=False)

    # drop every column except the Name 
    # df = df[df.columns[0:1]]

    df['score'] = 0.0

    # save the query results to a csv file
    # df.to_csv('queries/query{0}_score.csv'.format(idx+1), index=False)
